Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 12:04:33 +00:00)

Compare commits: feature/mi...feature/pa (507 commits)
							
								
								
									
.gitignore (6 changes, vendored)
```diff
@@ -92,6 +92,7 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
+.vscode
 
 # Eigen source #
 ################
@@ -106,6 +107,10 @@ lib/fftw/*
 m4/lt*
 m4/libtool.m4
 
+# github pages #
+################
+gh-pages/
+
 # Buck files #
 ##############
 .buck*
@@ -117,3 +122,4 @@ make-bin-BUCK.sh
 #####################
 lib/qcd/spin/gamma-gen/*.h
 lib/qcd/spin/gamma-gen/*.cc
+
```
							
								
								
									
.travis.yml (28 changes)
```diff
@@ -7,9 +7,11 @@ cache:
 matrix:
   include:
     - os:        osx
-      osx_image: xcode7.2
+      osx_image: xcode8.3
       compiler: clang
     - compiler: gcc
+      dist: trusty
+      sudo: required
       addons:
         apt:
           sources:
@@ -24,6 +26,8 @@ matrix:
             - binutils-dev
       env: VERSION=-4.9
     - compiler: gcc
+      dist: trusty
+      sudo: required
       addons:
         apt:
           sources:
@@ -38,6 +42,7 @@ matrix:
             - binutils-dev
       env: VERSION=-5
     - compiler: clang
+      dist: trusty
       addons:
         apt:
           sources:
@@ -52,6 +57,7 @@ matrix:
             - binutils-dev
       env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
     - compiler: clang
+      dist: trusty
       addons:
         apt:
           sources:
@@ -73,13 +79,15 @@ before_install:
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
 
 install:
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
     - echo $PATH
+    - which autoconf
+    - autoconf  --version
+    - which automake
+    - automake  --version
     - which $CC
     - $CC  --version
     - which $CXX
@@ -92,15 +100,15 @@ script:
     - cd build
     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
     - make -j4 
-    - ./benchmarks/Benchmark_dwf --threads 1
+    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - echo make clean
     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
     - make -j4
-    - ./benchmarks/Benchmark_dwf --threads 1
+    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
+    - make check
     - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
-    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
-    - make -j4
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+
```
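Taken together, the Travis changes move MPI testing off macOS and onto the Linux/clang jobs, add a `make check` step, and run the DWF benchmark with `--debug-signals`. A minimal local replay of the updated script, assuming `./bootstrap.sh` has already generated the build system and that a generic `mpirun` stands in for Travis's `mpirun.openmpi`:

```bash
mkdir -p build && cd build

# Single precision, no comms: build and smoke-test the DWF benchmark.
../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
make -j4
./benchmarks/Benchmark_dwf --threads 1 --debug-signals

# Double precision, followed by the new `make check` step.
../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
make -j4
./benchmarks/Benchmark_dwf --threads 1 --debug-signals
make check

# MPI build: two ranks, decomposed 2.1.1.1 across x.y.z.t.
../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
make -j4
mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1
```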
							
								
								
									
Makefile.am (11 changes)
```diff
@@ -3,10 +3,15 @@ SUBDIRS = lib benchmarks tests extras
 
 include $(top_srcdir)/doxygen.inc
 
-tests: all
-	$(MAKE) -C tests tests
+bin_SCRIPTS=grid-config
 
-.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
+.PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
+
+tests-local: all
+bench-local: all
+check-local: all
 
 AM_CXXFLAGS += -I$(top_builddir)/include
 
 ACLOCAL_AMFLAGS = -I m4
```
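The automake `-local` hooks above make the recursive targets depend on a full build first, so they work from a clean tree; `bench` and `tests` are declared phony alongside `check`. A sketch of the resulting workflow, assuming `tests` and `bench` recurse into the corresponding subdirectories as the README describes for `make tests`:

```bash
# check-local: all  =>  a bare `make check` first builds the library,
# then runs the test suite.
make check

# Build the full test and benchmark trees explicitly (assumed to recurse
# into tests/ and benchmarks/).
make tests
make bench
```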
							
								
								
									
README.md (29 changes)
````diff
@@ -22,6 +22,26 @@ Last update Nov 2016.
 
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 
+### Compilers
+
+Intel ICPC v16.0.3 and later
+
+Clang v3.5 and later (need 3.8 and later for OpenMP)
+
+GCC   v4.9.x (recommended)
+
+GCC   v6.3 and later
+
+### Important: 
+
+Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
+
+The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
+
+GCC   v5.x
+
+GCC   v6.1, v6.2
+
 ### Bug report
 
 _To help us tracking and solving more efficiently issues with Grid, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
@@ -32,7 +52,7 @@ When you file an issue, please go though the following checklist:
 2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output the `--version` option of your compiler.
 3. Give the exact `configure` command used.
 4. Attach `config.log`.
-5. Attach `config.summary`.
+5. Attach `grid.config.summary`.
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 
@@ -95,10 +115,10 @@ install Grid. Other options are detailed in the next section, you can also use `
 `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
 customise the build.
 
-Finally, you can build and install Grid:
+Finally, you can build, check, and install Grid:
 
 ``` bash
-make; make install
+make; make check; make install
 ```
 
 To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
@@ -121,7 +141,7 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
 - `--enable-precision={single|double}`: set the default precision (default: `double`).
 - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
-- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
+- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
 - `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
@@ -159,7 +179,6 @@ Alternatively, some CPU codenames can be directly used:
 
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
-| `KNC`       | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
 | `BGQ`       | Blue Gene/Q                            |
 
````
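The README now defaults the RNG to `sitmo`, inserts `make check` into the canonical build sequence, and drops the KNC target. An illustrative configure line combining the documented options; the AVX2 SIMD target and the install prefix are example choices, not defaults:

```bash
# Example build using options from the README section above.
../configure --enable-precision=double \
             --enable-simd=AVX2 \
             --enable-comms=mpi-auto \
             --enable-rng=sitmo \
             --prefix=$HOME/grid
make -j4
make check
make install
```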
							
								
								
									
TODO (62 changes)
```diff
@@ -1,6 +1,27 @@
 TODO:
 ---------------
 
+Peter's work list:
+1)- Precision conversion and sort out localConvert      <-- 
+2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
+
+-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
+-- Physical propagator interface
+-- Conserved currents
+-- GaugeFix into central location
+-- Multigrid Wilson and DWF, compare to other Multigrid implementations
+-- HDCR resume
+
+Recent DONE 
+-- Binary I/O speed up & x-strips                      <-- DONE
+-- Cut down the exterior overhead                      <-- DONE
+-- Interior legs from SHM comms                        <-- DONE
+-- Half-precision comms                                <-- DONE
+-- Merge high precision reduction into develop        
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+   -- slice* linalg routines for multiRHS, BlockCG    
+
+-----
 * Forces; the UdSdU  term in gauge force term is half of what I think it should
   be. This is a consequence of taking ONLY the first term in:
 
@@ -21,16 +42,8 @@ TODO:
   This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
   This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
 
 
-Policies:
-
-* Link smearing/boundary conds; Policy class based implementation ; framework more in place
-
-* Support different boundary conditions (finite temp, chem. potential ... )
-
-* Support different fermion representations? 
-  - contained entirely within the integrator presently
 
 - Sign of force term.
 
 - Reversibility test.
@@ -41,11 +54,6 @@ Policies:
 
 - Audit oIndex usage for cb behaviour
 
-- Rectangle gauge actions.
-  Iwasaki,
-  Symanzik,
-  ... etc...
-
 - Prepare multigrid for HMC. - Alternate setup schemes.
 
 - Support for ILDG --- ugly, not done
@@ -55,9 +63,11 @@ Policies:
 - FFTnD ?
 
 - Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
+
 - Gparity force term; Gparity (R)HMC.
 - Random number state save restore
 
+- Mobius implementation clean up to rmove #if 0 stale code sequences
 
 - CG -- profile carefully, kernel fusion, whole CG performance measurements.
 
@@ -90,6 +100,7 @@ Insert/Extract
 Not sure of status of this -- reverify. Things are working nicely now though.
 
 * Make the Tensor types and Complex etc... play more nicely.
+
   - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
     QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
     want to introduce a syntax that does not require this.
@@ -112,6 +123,8 @@ Not sure of status of this -- reverify. Things are working nicely now though.
 RECENT
 ---------------
 
+  - Support different fermion representations? -- DONE
+  - contained entirely within the integrator presently
   - Clean up HMC                                                             -- DONE
   - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE
   - Simplified the integrators a bit.                                        -- DONE
@@ -123,6 +136,26 @@ RECENT
   - Parallel io improvements                                  -- DONE
   - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
 
 
+DONE:
+- MultiArray -- MultiRHS done
+- ConjugateGradientMultiShift -- DONE
+- MCR                         -- DONE
+- Remez -- Mike or Boost?     -- DONE
+- Proto (ET)                  -- DONE
+- uBlas                       -- DONE ; Eigen
+- Potentially Useful Boost libraries -- DONE ; Eigen
+- Aligned allocator; memory pool -- DONE
+- Multiprecision              -- DONE
+- Serialization               -- DONE
+- Regex -- Not needed
+- Tokenize -- Why?
+
+- Random number state save restore -- DONE
+- Rectangle gauge actions. -- DONE
+  Iwasaki,
+  Symanzik,
+  ... etc...
+Done: Cayley, Partial , ContFrac force terms.
 
 DONE
@@ -207,6 +240,7 @@ Done
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
 ======================================================================================================
 
+* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE
 * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE
   user pass these? Is this a QCD specific?
 
```
							
								
								
									
VERSION (9 changes)
```diff
@@ -1,6 +1,5 @@
-Version : 0.6.0
+Version : 0.7.0
 
-- AVX512, AVX2, AVX, SSE good
-- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
-- MPI and MPI3
-- HiRep, Smearing, Generic gauge group
+- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
+- MPI and MPI3 comms optimisations for KNL and OPA finished
+- Half precision comms
```
Communications benchmark (no file header survived the capture; the content matches `benchmarks/Benchmark_comms.cc`)

```diff
@@ -31,6 +31,32 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+struct time_statistics{
+  double mean;
+  double err;
+  double min;
+  double max;
+
+  void statistics(std::vector<double> v){
+      double sum = std::accumulate(v.begin(), v.end(), 0.0);
+      mean = sum / v.size();
+
+      std::vector<double> diff(v.size());
+      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+      auto result = std::minmax_element(v.begin(), v.end());
+      min = *result.first;
+      max = *result.second;
+}
+};
+
+void header(){
+  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
+            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
+};
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -40,17 +66,21 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
-  int Nloop=10;
+  int Nloop=100;
   int nmu=0;
+  int maxlat=24;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
+  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
+  std::vector<double> t_time(Nloop);
+  time_statistics timestat;
+
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-  int maxlat=16;
-  for(int lat=4;lat<=maxlat;lat+=2){
-    for(int Ls=1;Ls<=16;Ls*=2){
+  header();
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=32;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -58,6 +88,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -65,8 +98,8 @@ int main (int argc, char ** argv)
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-      double start=usecond();
       for(int i=0;i<Nloop;i++){
+      double start=usecond();
 
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
@@ -102,18 +135,24 @@ int main (int argc, char ** argv)
 	}
 	Grid.SendToRecvFromComplete(requests);
 	Grid.Barrier();
 
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
       }
-      double stop=usecond();
 
-      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      timestat.statistics(t_time);
 
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
       double rbytes    = xbytes;
       double bidibytes = xbytes+rbytes;
 
-      double time = stop-start; // microseconds
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }    
 
@@ -121,15 +160,17 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  header();
 
-  for(int lat=4;lat<=maxlat;lat+=2){
-    for(int Ls=1;Ls<=16;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=32;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -138,8 +179,8 @@ int main (int argc, char ** argv)
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-      double start=usecond();
       for(int i=0;i<Nloop;i++){
+      double start=usecond();
     
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
@@ -178,30 +219,37 @@ int main (int argc, char ** argv)
 	  }
 	}
 	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
 
       }
 
-      double stop=usecond();
+      timestat.statistics(t_time);
      
-      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
       double rbytes    = xbytes;
       double bidibytes = xbytes+rbytes;
 
-      double time = stop-start;
+    std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }  
 
 
+  Nloop=100;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  header();
 
-  for(int lat=4;lat<=maxlat;lat+=2){
-    for(int Ls=1;Ls<=16;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=32;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -209,6 +257,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -216,73 +267,86 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-      double start=usecond();
+      double dbytes;
       for(int i=0;i<Nloop;i++){
+	double start=usecond();
 
+	dbytes=0;
+	ncomm=0;
 
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
-	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	
 	  if (mpi_layout[mu]>1 ) {
 	  
 	    ncomm++;
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu][0],
+					      recv_from_rank,
+					      bytes);
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu+4][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu+4][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu+4][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu+4][0],
+					      recv_from_rank,
+					      bytes);
 	  
 	  }
 	}
 	Grid.StencilSendToRecvFromComplete(requests);
 	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
 	
       }
-      double stop=usecond();
 
-      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      timestat.statistics(t_time);
 
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
 
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
-      double time = stop-start; // microseconds
-
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }    
 
 
 
+  Nloop=100;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  header();
 
-  for(int lat=4;lat<=maxlat;lat+=2){
-    for(int Ls=1;Ls<=16;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=32;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -290,6 +354,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -297,16 +364,18 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-      double start=usecond();
+      double dbytes;
       for(int i=0;i<Nloop;i++){
+	double start=usecond();
 
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
+	dbytes=0;
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	
@@ -318,42 +387,52 @@ int main (int argc, char ** argv)
 	    int recv_from_rank;
 	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu][0],
-					    recv_from_rank,
-					    bytes);
-	    //	    Grid.StencilSendToRecvFromComplete(requests);
-	    //	    requests.resize(0);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu][0],
+					      recv_from_rank,
+					      bytes);
+	    Grid.StencilSendToRecvFromComplete(requests);
+	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
 	  
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu+4][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu+4][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu+4][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu+4][0],
+					      recv_from_rank,
+					      bytes);
+	    Grid.StencilSendToRecvFromComplete(requests);
+	    requests.resize(0);
 	  
 	  }
 	}
 	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
 	
       }
-      double stop=usecond();
 
-      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      timestat.statistics(t_time);
 
-      double time = stop-start; // microseconds
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
 
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }    
```
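Each section now times every iteration individually and reports bandwidth as mean with standard error of the mean (`err = sqrt(sum((x_i - mean)^2) / (n*(n-1)))` in `time_statistics`), plus min and max, normalised by ranks per node (`ppn`) instead of a single whole-run average. A hypothetical invocation; the binary name is inferred from the content, since the file header was lost in the capture:

```bash
# Four ranks decomposed 2.2.1.1 across x.y.z.t; Grid_init consumes
# the --mpi and --threads flags for Grid executables.
mpirun -n 4 ./benchmarks/Benchmark_comms --threads 1 --mpi 2.2.1.1
```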
| @@ -1,28 +1,22 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|  /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./benchmarks/Benchmark_dwf.cc | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| @@ -48,16 +42,16 @@ typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR; | ||||
| typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF; | ||||
| typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|  | ||||
|   int threads = GridThread::GetThreads(); | ||||
|   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; | ||||
|  | ||||
|   std::vector<int> latt4 = GridDefaultLatt(); | ||||
|   const int Ls=8; | ||||
|   const int Ls=16; | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
| @@ -72,34 +66,65 @@ int main (int argc, char ** argv) | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|    | ||||
|   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||
|   std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||
|  | ||||
|   LatticeFermion src   (FGrid); random(RNG5,src); | ||||
| #if 0 | ||||
|   src = zero; | ||||
|   { | ||||
|     std::vector<int> origin({0,0,0,latt4[2]-1,0}); | ||||
|     SpinColourVectorF tmp; | ||||
|     tmp=zero; | ||||
|     tmp()(0)(0)=Complex(-2.0,0.0); | ||||
|     std::cout << " source site 0 " << tmp<<std::endl; | ||||
|     pokeSite(tmp,src,origin); | ||||
|   } | ||||
| #else | ||||
|   RealD N2 = 1.0/::sqrt(norm2(src)); | ||||
|   src = src*N2; | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   LatticeFermion result(FGrid); result=zero; | ||||
|   LatticeFermion    ref(FGrid);    ref=zero; | ||||
|   LatticeFermion    tmp(FGrid); | ||||
|   LatticeFermion    err(FGrid); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeField Umu(UGrid);  | ||||
|   random(RNG4,Umu); | ||||
|  | ||||
|   LatticeGaugeField Umu5d(FGrid);  | ||||
|   SU3::HotConfiguration(RNG4,Umu);  | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
| #if 0 | ||||
|   Umu=1.0; | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     LatticeColourMatrix ttmp(UGrid); | ||||
|     ttmp = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|     //    if (mu !=2 ) ttmp = 0; | ||||
|     //    ttmp = ttmp* pow(10.0,mu); | ||||
|     PokeIndex<LorentzIndex>(Umu,ttmp,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Forced to diagonal " << std::endl; | ||||
| #endif | ||||
|  | ||||
|   //////////////////////////////////// | ||||
|   // Naive wilson implementation | ||||
|   //////////////////////////////////// | ||||
|   // replicate across fifth dimension | ||||
|   LatticeGaugeField Umu5d(FGrid);  | ||||
|   std::vector<LatticeColourMatrix> U(4,FGrid); | ||||
|   for(int ss=0;ss<Umu._grid->oSites();ss++){ | ||||
|     for(int s=0;s<Ls;s++){ | ||||
|       Umu5d._odata[Ls*ss+s] = Umu._odata[ss]; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   //////////////////////////////////// | ||||
|   // Naive wilson implementation | ||||
|   //////////////////////////////////// | ||||
|   std::vector<LatticeColourMatrix> U(4,FGrid); | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; | ||||
|  | ||||
|   if (1) | ||||
|   { | ||||
| @@ -120,8 +145,7 @@ int main (int argc, char ** argv) | ||||
|   RealD M5  =1.8; | ||||
|  | ||||
|   RealD NP = UGrid->_Nprocessors; | ||||
|  | ||||
|   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|   RealD NN = UGrid->NodeCount(); | ||||
|  | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||
| @@ -131,15 +155,22 @@ int main (int argc, char ** argv) | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; | ||||
|   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|  | ||||
|   int ncall =100; | ||||
|   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|   int ncall =1000; | ||||
|   if (1) { | ||||
|     FGrid->Barrier(); | ||||
|     Dw.ZeroCounters(); | ||||
|     Dw.Dhop(src,result,0); | ||||
|     std::cout<<GridLogMessage<<"Called warmup"<<std::endl; | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       __SSC_START; | ||||
| @@ -153,16 +184,55 @@ int main (int argc, char ** argv) | ||||
|     double flops=1344*volume*ncall; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     err = ref-result;  | ||||
|     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|  | ||||
|     /* | ||||
|     if(( norm2(err)>1.0e-4) ) {  | ||||
|       std::cout << "RESULT\n " << result<<std::endl; | ||||
|       std::cout << "REF   \n " << ref   <<std::endl; | ||||
|       std::cout << "ERR   \n " << err   <<std::endl; | ||||
|       FGrid->Barrier(); | ||||
|       exit(-1); | ||||
|     } | ||||
|     */ | ||||
|     assert (norm2(err)< 1.0e-4 ); | ||||
|     Dw.Report(); | ||||
|   } | ||||
|  | ||||
|   DomainWallFermionRL DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|   if (1) { | ||||
|     FGrid->Barrier(); | ||||
|     DwH.ZeroCounters(); | ||||
|     DwH.Dhop(src,result,0); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       __SSC_START; | ||||
|       DwH.Dhop(src,result,0); | ||||
|       __SSC_STOP; | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|      | ||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|     double flops=1344*volume*ncall; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     err = ref-result;  | ||||
|     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|  | ||||
|     assert (norm2(err)< 1.0e-3 ); | ||||
|     DwH.Report(); | ||||
|   } | ||||
|  | ||||
|   if (1) | ||||
|   { | ||||
|  | ||||
| @@ -171,6 +241,10 @@ int main (int argc, char ** argv) | ||||
|     std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl; | ||||
|     if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|     if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
| @@ -183,20 +257,12 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); | ||||
|  | ||||
|     for(int x=0;x<latt4[0];x++){ | ||||
|     for(int y=0;y<latt4[1];y++){ | ||||
|     for(int z=0;z<latt4[2];z++){ | ||||
|     for(int t=0;t<latt4[3];t++){ | ||||
|     for(int s=0;s<Ls;s++){ | ||||
|       std::vector<int> site({s,x,y,z,t}); | ||||
|       SpinColourVector tmp; | ||||
|       peekSite(tmp,src,site); | ||||
|       pokeSite(tmp,ssrc,site); | ||||
|     }}}}} | ||||
|     localConvert(src,ssrc); | ||||
|     std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl; | ||||
|     FGrid->Barrier(); | ||||
|     sDw.Dhop(ssrc,sresult,0); | ||||
|     sDw.ZeroCounters(); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       __SSC_START; | ||||
|       sDw.Dhop(ssrc,sresult,0); | ||||
| @@ -210,46 +276,52 @@ int main (int argc, char ** argv) | ||||
|     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     //    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl; | ||||
|     sDw.Report(); | ||||
|    | ||||
|     if(0){ | ||||
|       for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ | ||||
| 	sDw.Dhop(ssrc,sresult,0); | ||||
| 	PerformanceCounter Counter(i); | ||||
| 	Counter.Start(); | ||||
| 	sDw.Dhop(ssrc,sresult,0); | ||||
| 	Counter.Stop(); | ||||
| 	Counter.Report(); | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl; | ||||
|  | ||||
|     RealD sum=0; | ||||
|     for(int x=0;x<latt4[0];x++){ | ||||
|     for(int y=0;y<latt4[1];y++){ | ||||
|     for(int z=0;z<latt4[2];z++){ | ||||
|     for(int t=0;t<latt4[3];t++){ | ||||
|     for(int s=0;s<Ls;s++){ | ||||
|       std::vector<int> site({s,x,y,z,t}); | ||||
|       SpinColourVector normal, simd; | ||||
|       peekSite(normal,result,site); | ||||
|       peekSite(simd,sresult,site); | ||||
|       sum=sum+norm2(normal-simd); | ||||
|       if (norm2(normal-simd) > 1.0e-6 ) { | ||||
| 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl; | ||||
| 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl; | ||||
| 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl; | ||||
|       } | ||||
|     }}}}} | ||||
|     std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl; | ||||
|     assert (sum< 1.0e-4 ); | ||||
|  | ||||
|     err=zero; | ||||
|     localConvert(sresult,err); | ||||
|     err = err - ref; | ||||
|     sum = norm2(err); | ||||
|     std::cout<<GridLogMessage<<" difference between normal ref and simd is "<<sum<<std::endl; | ||||
|     if(sum > 1.0e-4 ){ | ||||
|       std::cout<< "sD REF\n " <<ref << std::endl; | ||||
|       std::cout<< "sD ERR   \n " <<err  <<std::endl; | ||||
|     } | ||||
|     //    assert(sum < 1.0e-4); | ||||
|  | ||||
|     if (1) { | ||||
|     err=zero; | ||||
|     localConvert(sresult,err); | ||||
|     err = err - result; | ||||
|     sum = norm2(err); | ||||
|     std::cout<<GridLogMessage<<" difference between normal result and simd is "<<sum<<std::endl; | ||||
|     if(sum > 1.0e-4 ){ | ||||
|       std::cout<< "sD REF\n " <<result << std::endl; | ||||
|       std::cout<< "sD ERR   \n " << err  <<std::endl; | ||||
|     } | ||||
|     assert(sum < 1.0e-4); | ||||
|      | ||||
|     if(1){ | ||||
|       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|       std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl; | ||||
|       std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl; | ||||
|       if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|       if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   )  | ||||
| 	std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)  | ||||
| 	std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm )  | ||||
| 	std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|  | ||||
|       LatticeFermion sr_eo(sFGrid); | ||||
|  | ||||
|       LatticeFermion ssrc_e (sFrbGrid); | ||||
|       LatticeFermion ssrc_o (sFrbGrid); | ||||
|       LatticeFermion sr_e   (sFrbGrid); | ||||
| @@ -257,39 +329,30 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|       pickCheckerboard(Even,ssrc_e,ssrc); | ||||
|       pickCheckerboard(Odd,ssrc_o,ssrc); | ||||
|  | ||||
|       setCheckerboard(sr_eo,ssrc_o); | ||||
|       setCheckerboard(sr_eo,ssrc_e); | ||||
|       //      setCheckerboard(sr_eo,ssrc_o); | ||||
|       //      setCheckerboard(sr_eo,ssrc_e); | ||||
|  | ||||
|       sr_e = zero; | ||||
|       sr_o = zero; | ||||
|  | ||||
|       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|       std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl; | ||||
|       std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl; | ||||
|       if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|       if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|  | ||||
|       FGrid->Barrier(); | ||||
|       sDw.DhopEO(ssrc_o, sr_e, DaggerNo); | ||||
|       sDw.ZeroCounters(); | ||||
|       sDw.stat.init("DhopEO"); | ||||
|       //      sDw.stat.init("DhopEO"); | ||||
|       double t0=usecond(); | ||||
|       for (int i = 0; i < ncall; i++) { | ||||
|         sDw.DhopEO(ssrc_o, sr_e, DaggerNo); | ||||
|       } | ||||
|       double t1=usecond(); | ||||
|       FGrid->Barrier(); | ||||
|       sDw.stat.print(); | ||||
|       //      sDw.stat.print(); | ||||
|  | ||||
|       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|       double flops=(1344.0*volume*ncall)/2; | ||||
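|       // Note: the checkerboarded DhopEO acts on half the sites per call, | ||||
|       // hence half the flop count of the full Dhop above. | ||||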
|  | ||||
|       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|       std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||
|       std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl; | ||||
|       sDw.Report(); | ||||
|  | ||||
|       sDw.DhopEO(ssrc_o,sr_e,DaggerNo); | ||||
| @@ -298,22 +361,26 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|       pickCheckerboard(Even,ssrc_e,sresult); | ||||
|       pickCheckerboard(Odd ,ssrc_o,sresult); | ||||
|  | ||||
|       ssrc_e = ssrc_e - sr_e; | ||||
|       RealD error = norm2(ssrc_e); | ||||
|  | ||||
|       std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl; | ||||
|       ssrc_o = ssrc_o - sr_o; | ||||
|       error+= norm2(ssrc_o); | ||||
|       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl; | ||||
|       if(error>1.0e-4) {  | ||||
|  | ||||
|       if(( error>1.0e-4) ) {  | ||||
| 	setCheckerboard(ssrc,ssrc_o); | ||||
| 	setCheckerboard(ssrc,ssrc_e); | ||||
| 	std::cout<< ssrc << std::endl; | ||||
| 	std::cout<< "DIFF\n " <<ssrc << std::endl; | ||||
| 	setCheckerboard(ssrc,sr_o); | ||||
| 	setCheckerboard(ssrc,sr_e); | ||||
| 	std::cout<< "CBRESULT\n " <<ssrc << std::endl; | ||||
| 	std::cout<< "RESULT\n " <<sresult<< std::endl; | ||||
|       } | ||||
|       assert(error<1.0e-4); | ||||
|     } | ||||
|  | ||||
|  | ||||
|   } | ||||
|  | ||||
|   if (1) | ||||
| @@ -324,25 +391,30 @@ int main (int argc, char ** argv) | ||||
|       //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x | ||||
|       tmp = U[mu]*Cshift(src,mu+1,1); | ||||
|       for(int i=0;i<ref._odata.size();i++){ | ||||
|   ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ; | ||||
| 	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; | ||||
|       } | ||||
|  | ||||
|       tmp =adj(U[mu])*src; | ||||
|       tmp =Cshift(tmp,mu+1,-1); | ||||
|       for(int i=0;i<ref._odata.size();i++){ | ||||
|   ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ; | ||||
| 	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; | ||||
|       } | ||||
|     } | ||||
|     ref = -0.5*ref; | ||||
|   } | ||||
|   //  dump=1; | ||||
|   Dw.Dhop(src,result,1); | ||||
|   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; | ||||
|   std::cout<<GridLogMessage << "Called DwDag"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl; | ||||
|   err = ref-result;  | ||||
|   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|   assert(norm2(err)<1.0e-4); | ||||
|   std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl; | ||||
|   if (norm2(err) > 1.0e-4) { | ||||
| 	std::cout<< "DAG REF\n "    << ref    << std::endl; | ||||
| 	std::cout<< "DAG RESULT\n " << result << std::endl; | ||||
| 	std::cout<< "DAG ERR\n "    << err    << std::endl; | ||||
|   } | ||||
|   LatticeFermion src_e (FrbGrid); | ||||
|   LatticeFermion src_o (FrbGrid); | ||||
|   LatticeFermion r_e   (FrbGrid); | ||||
| @@ -350,18 +422,24 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion r_eo  (FGrid); | ||||
|  | ||||
|  | ||||
|   std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; | ||||
|   pickCheckerboard(Even,src_e,src); | ||||
|   pickCheckerboard(Odd,src_o,src); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; | ||||
|  | ||||
|  | ||||
|   // S-direction is INNERMOST and takes no part in the parity. | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; | ||||
|   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
| @@ -369,6 +447,7 @@ int main (int argc, char ** argv) | ||||
|   { | ||||
|     Dw.ZeroCounters(); | ||||
|     FGrid->Barrier(); | ||||
|     Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
| @@ -381,6 +460,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     Dw.Report(); | ||||
|   } | ||||
|   Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
| @@ -396,14 +476,20 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   err = r_eo-result;  | ||||
|   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|   assert(norm2(err)<1.0e-4); | ||||
|   if (norm2(err) > 1.0e-4) { | ||||
| 	std::cout<< "Deo RESULT\n " <<r_eo << std::endl; | ||||
| 	std::cout<< "Deo REF\n " <<result  << std::endl; | ||||
| 	std::cout<< "Deo ERR   \n " << err <<std::endl; | ||||
|   } | ||||
|  | ||||
|   pickCheckerboard(Even,src_e,err); | ||||
|   pickCheckerboard(Odd,src_o,err); | ||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||
|   assert(norm2(src_e)<1.0e-4); | ||||
|   assert(norm2(src_o)<1.0e-4); | ||||
|  | ||||
|   //assert(norm2(src_e)<1.0e-4); | ||||
|   //assert(norm2(src_o)<1.0e-4); | ||||
|  | ||||
|   Grid_finalize(); | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -66,7 +66,8 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     Vec tsum; tsum = zero; | ||||
|  | ||||
|     GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|     GridParallelRNG          pRNG(&Grid);       | ||||
|     pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101})); | ||||
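|     // Note: fixed seeds (rather than SeedRandomDevice) keep successive | ||||
|     // benchmark runs reproducible. | ||||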
|  | ||||
|     std::vector<double> stop(threads); | ||||
|     Vector<Vec> sum(threads); | ||||
| @@ -77,8 +78,7 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|  | ||||
|     double start=usecond(); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int t=0;t<threads;t++){ | ||||
|     parallel_for(int t=0;t<threads;t++){ | ||||
|  | ||||
|       sum[t] = x[t]._odata[0]; | ||||
|       for(int i=0;i<Nloop;i++){ | ||||
|   | ||||
| @@ -55,8 +55,8 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|   uint64_t lmax=44; | ||||
| #define NLOOP (1*lmax*lmax*lmax*lmax/vol) | ||||
|   uint64_t lmax=64; | ||||
| #define NLOOP (100*lmax*lmax*lmax*lmax/vol) | ||||
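| // Note: NLOOP scales as lmax^4/vol, so each lattice size in the loop below | ||||
| // performs a comparable amount of total work. | ||||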
|   for(int lat=4;lat<=lmax;lat+=4){ | ||||
|  | ||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
| @@ -65,7 +65,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|       uint64_t Nloop=NLOOP; | ||||
|  | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeVec z(&Grid); //random(pRNG,z); | ||||
|       LatticeVec x(&Grid); //random(pRNG,x); | ||||
| @@ -100,7 +100,7 @@ int main (int argc, char ** argv) | ||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|  | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeVec z(&Grid); //random(pRNG,z); | ||||
|       LatticeVec x(&Grid); //random(pRNG,x); | ||||
| @@ -138,7 +138,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|  | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeVec z(&Grid); //random(pRNG,z); | ||||
|       LatticeVec x(&Grid); //random(pRNG,x); | ||||
| @@ -173,7 +173,7 @@ int main (int argc, char ** argv) | ||||
|       uint64_t Nloop=NLOOP; | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|  | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|       LatticeVec z(&Grid); //random(pRNG,z); | ||||
|       LatticeVec x(&Grid); //random(pRNG,x); | ||||
|       LatticeVec y(&Grid); //random(pRNG,y); | ||||
|   | ||||
benchmarks/Benchmark_staggered.cc (new file, 134 lines)
							| @@ -0,0 +1,134 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./benchmarks/Benchmark_staggered.cc | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
| using namespace Grid::QCD; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   std::vector<int> latt_size   = GridDefaultLatt(); | ||||
|   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||
|   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||
|   GridCartesian               Grid(latt_size,simd_layout,mpi_layout); | ||||
|   GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout); | ||||
|  | ||||
|   int threads = GridThread::GetThreads(); | ||||
|   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl; | ||||
|  | ||||
|   std::vector<int> seeds({1,2,3,4}); | ||||
|   GridParallelRNG          pRNG(&Grid); | ||||
|   pRNG.SeedFixedIntegers(seeds); | ||||
|   //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|   typedef ImprovedStaggeredFermionR::FermionField FermionField;  | ||||
|   ImprovedStaggeredFermionR::ImplParams params;  | ||||
|  | ||||
|   FermionField src   (&Grid); random(pRNG,src); | ||||
|   FermionField result(&Grid); result=zero; | ||||
|   FermionField    ref(&Grid);    ref=zero; | ||||
|   FermionField    tmp(&Grid);    tmp=zero; | ||||
|   FermionField    err(&Grid);    err=zero; | ||||
|   LatticeGaugeField Umu(&Grid); random(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|   double volume=1; | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     volume=volume*latt_size[mu]; | ||||
|   }   | ||||
|  | ||||
|   // Only one non-zero (y) | ||||
| #if 0 | ||||
|   Umu=zero; | ||||
|   Complex cone(1.0,0.0); | ||||
|   for(int nn=0;nn<Nd;nn++){ | ||||
|     random(pRNG,U[nn]); | ||||
|     if(1) { | ||||
|       if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; } | ||||
|       //      else       { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; } | ||||
|       else       { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; } | ||||
|     } | ||||
|     PokeIndex<LorentzIndex>(Umu,U[nn],nn); | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|   } | ||||
|   ref = zero; | ||||
|   /*   | ||||
|   { // Naive wilson implementation | ||||
|     ref = zero; | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|       //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x | ||||
|       tmp = U[mu]*Cshift(src,mu,1); | ||||
|       for(int i=0;i<ref._odata.size();i++){ | ||||
| 	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; | ||||
|       } | ||||
|  | ||||
|       tmp =adj(U[mu])*src; | ||||
|       tmp =Cshift(tmp,mu,-1); | ||||
|       for(int i=0;i<ref._odata.size();i++){ | ||||
| 	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   ref = -0.5*ref; | ||||
|   */ | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD c1=9.0/8.0; | ||||
|   RealD c2=-1.0/24.0; | ||||
|   RealD u0=1.0; | ||||
|   ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); | ||||
|    | ||||
|   std::cout<<GridLogMessage << "Calling Ds"<<std::endl; | ||||
|   int ncall=1000; | ||||
|   double t0=usecond(); | ||||
|   for(int i=0;i<ncall;i++){ | ||||
|     Ds.Dhop(src,result,0); | ||||
|   } | ||||
|   double t1=usecond(); | ||||
|   double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 16*66 + 15*3*2 == 1056 + 90 == 1146 | ||||
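|   // Reading the formula off: 16 SU(3) matrix-vector products per site | ||||
|   // (8 directions, one-link plus Naik term) at 3*(6+8+8)=66 flops each, | ||||
|   // plus 15 three-colour complex adds (15*3*2=90) to accumulate. | ||||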
|    | ||||
|   std::cout<<GridLogMessage << "Called Ds"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|   err = ref-result;  | ||||
|   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|  | ||||
|   Grid_finalize(); | ||||
| } | ||||
| @@ -35,8 +35,9 @@ using namespace Grid::QCD; | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
| #define LMAX (64) | ||||
|  | ||||
|   int Nloop=1000; | ||||
|   int Nloop=20; | ||||
|  | ||||
|   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||
|   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||
| @@ -50,12 +51,12 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=2;lat<=32;lat+=2){ | ||||
|   for(int lat=2;lat<=LMAX;lat+=2){ | ||||
|  | ||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeColourMatrix z(&Grid);// random(pRNG,z); | ||||
|       LatticeColourMatrix x(&Grid);// random(pRNG,x); | ||||
| @@ -82,13 +83,13 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=2;lat<=32;lat+=2){ | ||||
|   for(int lat=2;lat<=LMAX;lat+=2){ | ||||
|  | ||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); | ||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); | ||||
| @@ -113,13 +114,13 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=2;lat<=32;lat+=2){ | ||||
|   for(int lat=2;lat<=LMAX;lat+=2){ | ||||
|  | ||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); | ||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); | ||||
| @@ -144,13 +145,13 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=2;lat<=32;lat+=2){ | ||||
|   for(int lat=2;lat<=LMAX;lat+=2){ | ||||
|  | ||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); | ||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); | ||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); | ||||
|   | ||||
| @@ -69,7 +69,7 @@ int main (int argc, char ** argv) | ||||
|   std::vector<int> seeds({1,2,3,4}); | ||||
|   GridParallelRNG          pRNG(&Grid); | ||||
|   pRNG.SeedFixedIntegers(seeds); | ||||
|   //  pRNG.SeedRandomDevice(); | ||||
|   //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|   LatticeFermion src   (&Grid); random(pRNG,src); | ||||
|   LatticeFermion result(&Grid); result=zero; | ||||
|   | ||||
| @@ -1,11 +1,7 @@ | ||||
| include Make.inc | ||||
|  | ||||
| simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o | ||||
|  | ||||
| EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a | ||||
|  | ||||
| libsimple_su3_test_a_SOURCES = simple_su3_test.cc | ||||
|  | ||||
| libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc | ||||
|  | ||||
| libsimple_simd_test_a_SOURCES = simple_simd_test.cc | ||||
| bench-local: all | ||||
| 	./Benchmark_su3 | ||||
| 	./Benchmark_memory_bandwidth | ||||
| 	./Benchmark_wilson | ||||
| 	./Benchmark_dwf --dslash-unroll | ||||
| @@ -1,6 +1,6 @@ | ||||
| #!/usr/bin/env bash | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2' | ||||
| EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2' | ||||
|  | ||||
| echo "-- deploying Eigen source..." | ||||
| wget ${EIGEN_URL} --no-check-certificate | ||||
|   | ||||
configure.ac (124 lines)
							| @@ -1,16 +1,19 @@ | ||||
| AC_PREREQ([2.63]) | ||||
| AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid]) | ||||
| AC_INIT([Grid], [0.7.0], [https://github.com/paboyle/Grid], [Grid]) | ||||
| AC_CANONICAL_BUILD | ||||
| AC_CANONICAL_HOST | ||||
| AC_CANONICAL_TARGET | ||||
| AM_INIT_AUTOMAKE(subdir-objects) | ||||
| AM_INIT_AUTOMAKE([subdir-objects 1.13]) | ||||
| AM_EXTRA_RECURSIVE_TARGETS([tests bench]) | ||||
| AC_CONFIG_MACRO_DIR([m4]) | ||||
| AC_CONFIG_SRCDIR([lib/Grid.h]) | ||||
| AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h]) | ||||
| m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) | ||||
|  | ||||
| ################ Get git info | ||||
| #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])]) | ||||
|  | ||||
| ############### Checks for programs | ||||
| CXXFLAGS="-O3 $CXXFLAGS" | ||||
| AC_PROG_CXX | ||||
| AC_PROG_RANLIB | ||||
|  | ||||
| @@ -24,6 +27,9 @@ AX_GXX_VERSION | ||||
| AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], | ||||
|       [version of g++ that will compile the code]) | ||||
|  | ||||
| CXXFLAGS="-g $CXXFLAGS" | ||||
|  | ||||
|  | ||||
| ############### Checks for typedefs, structures, and compiler characteristics | ||||
| AC_TYPE_SIZE_T | ||||
| AC_TYPE_UINT32_T | ||||
| @@ -67,6 +73,13 @@ AC_ARG_WITH([fftw], | ||||
|             [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"] | ||||
|             [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"]) | ||||
|  | ||||
| ############### LIME | ||||
| AC_ARG_WITH([lime], | ||||
|             [AS_HELP_STRING([--with-lime=prefix], | ||||
|             [try this for a non-standard install prefix of the LIME library])], | ||||
|             [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"] | ||||
|             [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"]) | ||||
|  | ||||
| ############### lapack | ||||
| AC_ARG_ENABLE([lapack], | ||||
|     [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], | ||||
| @@ -83,6 +96,18 @@ case ${ac_LAPACK} in | ||||
|         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; | ||||
| esac | ||||
|  | ||||
| ############### FP16 conversions | ||||
| AC_ARG_ENABLE([sfw-fp16], | ||||
|     [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])], | ||||
|     [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes]) | ||||
| case ${ac_SFW_FP16} in | ||||
|     yes) | ||||
|       AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);; | ||||
|     no);; | ||||
|     *) | ||||
|       AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; | ||||
| esac | ||||
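| # Note: with --enable-sfw-fp16=no the SSE4 case below adds -mf16c, using | ||||
| # hardware fp16 conversions instead of the software fallback. | ||||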
|  | ||||
| ############### MKL | ||||
| AC_ARG_ENABLE([mkl], | ||||
|     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], | ||||
| @@ -152,6 +177,18 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3], | ||||
|                [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])] | ||||
|                [have_fftw=true]) | ||||
|  | ||||
| AC_SEARCH_LIBS([limeCreateReader], [lime], | ||||
|                [AC_DEFINE([HAVE_LIME], [1], [Define to 1 if you have the `LIME' library])] | ||||
|                [have_lime=true], | ||||
| 	       [AC_MSG_WARN(C-LIME library was not found in your system. | ||||
| In order to use ILDG file format please install or provide the correct path to your installation | ||||
| Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)]) | ||||
|  | ||||
| AC_SEARCH_LIBS([crc32], [z], | ||||
|                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])] | ||||
|                [have_zlib=true], | ||||
| 	       [AC_MSG_ERROR(zlib library was not found in your system.)]) | ||||
|  | ||||
| AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp], | ||||
|                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])] | ||||
|                [have_hdf5=true] | ||||
| @@ -176,19 +213,26 @@ case ${ax_cv_cxx_compiler_vendor} in | ||||
|     case ${ac_SIMD} in | ||||
|       SSE4) | ||||
|         AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) | ||||
|         SIMD_FLAGS='-msse4.2';; | ||||
| 	case ${ac_SFW_FP16} in | ||||
| 	  yes) | ||||
| 	  SIMD_FLAGS='-msse4.2';; | ||||
| 	  no) | ||||
| 	  SIMD_FLAGS='-msse4.2 -mf16c';; | ||||
| 	  *) | ||||
|           AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);; | ||||
| 	esac;; | ||||
|       AVX) | ||||
|         AC_DEFINE([AVX1],[1],[AVX intrinsics]) | ||||
|         SIMD_FLAGS='-mavx';; | ||||
|         SIMD_FLAGS='-mavx -mf16c';; | ||||
|       AVXFMA4) | ||||
|         AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) | ||||
|         SIMD_FLAGS='-mavx -mfma4';; | ||||
|         SIMD_FLAGS='-mavx -mfma4 -mf16c';; | ||||
|       AVXFMA) | ||||
|         AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) | ||||
|         SIMD_FLAGS='-mavx -mfma';; | ||||
|         SIMD_FLAGS='-mavx -mfma -mf16c';; | ||||
|       AVX2) | ||||
|         AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) | ||||
|         SIMD_FLAGS='-mavx2 -mfma';; | ||||
|         SIMD_FLAGS='-mavx2 -mfma -mf16c';; | ||||
|       AVX512) | ||||
|         AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) | ||||
|         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; | ||||
| @@ -321,7 +365,7 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | ||||
| ############### RNG selection | ||||
| AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\ | ||||
| 	            [Select Random Number Generator to be used])],\ | ||||
| 	            [ac_RNG=${enable_rng}],[ac_RNG=ranlux48]) | ||||
| 	            [ac_RNG=${enable_rng}],[ac_RNG=sitmo]) | ||||
|  | ||||
| case ${ac_RNG} in | ||||
|      ranlux48) | ||||
| @@ -384,32 +428,31 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg]) | ||||
|  | ||||
| ############### Output | ||||
| cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd} | ||||
| GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" | ||||
| GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS" | ||||
| GRID_LIBS=$LIBS | ||||
| GRID_SHORT_SHA=`git rev-parse --short HEAD` | ||||
| GRID_SHA=`git rev-parse HEAD` | ||||
| GRID_BRANCH=`git rev-parse --abbrev-ref HEAD` | ||||
| AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS" | ||||
| AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS" | ||||
| AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS" | ||||
| AC_SUBST([AM_CFLAGS]) | ||||
| AC_SUBST([AM_CXXFLAGS]) | ||||
| AC_SUBST([AM_LDFLAGS]) | ||||
| AC_CONFIG_FILES(Makefile) | ||||
| AC_CONFIG_FILES(lib/Makefile) | ||||
| AC_CONFIG_FILES(tests/Makefile) | ||||
| AC_CONFIG_FILES(tests/IO/Makefile) | ||||
| AC_CONFIG_FILES(tests/core/Makefile) | ||||
| AC_CONFIG_FILES(tests/debug/Makefile) | ||||
| AC_CONFIG_FILES(tests/forces/Makefile) | ||||
| AC_CONFIG_FILES(tests/hadrons/Makefile) | ||||
| AC_CONFIG_FILES(tests/hmc/Makefile) | ||||
| AC_CONFIG_FILES(tests/solver/Makefile) | ||||
| AC_CONFIG_FILES(tests/qdpxx/Makefile) | ||||
| AC_CONFIG_FILES(benchmarks/Makefile) | ||||
| AC_CONFIG_FILES(extras/Makefile) | ||||
| AC_CONFIG_FILES(extras/Hadrons/Makefile) | ||||
| AC_OUTPUT | ||||
| AC_SUBST([GRID_CXXFLAGS]) | ||||
| AC_SUBST([GRID_LDFLAGS]) | ||||
| AC_SUBST([GRID_LIBS]) | ||||
| AC_SUBST([GRID_SHA]) | ||||
| AC_SUBST([GRID_BRANCH]) | ||||
|  | ||||
| git_commit=`cd $srcdir && ./scripts/configure.commit` | ||||
|  | ||||
| echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| Summary of configuration for $PACKAGE v$VERSION | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| ----- GIT VERSION ------------------------------------- | ||||
| $git_commit | ||||
| ----- PLATFORM ---------------------------------------- | ||||
| architecture (build)        : $build_cpu | ||||
| os (build)                  : $build_os | ||||
| @@ -422,10 +465,12 @@ SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} | ||||
| Threading                   : ${ac_openmp} | ||||
| Communications type         : ${comms_type} | ||||
| Default precision           : ${ac_PRECISION} | ||||
| Software FP16 conversion    : ${ac_SFW_FP16} | ||||
| RNG choice                  : ${ac_RNG} | ||||
| GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` | ||||
| LAPACK                      : ${ac_LAPACK} | ||||
| FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi` | ||||
| LIME (ILDG support)         : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi` | ||||
| HDF5                        : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi` | ||||
| build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi` | ||||
| ----- BUILD FLAGS ------------------------------------- | ||||
| @@ -435,7 +480,32 @@ LDFLAGS: | ||||
| `echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'` | ||||
| LIBS: | ||||
| `echo ${LIBS} | tr ' ' '\n' | sed 's/^-/    -/g'` | ||||
| -------------------------------------------------------" > config.summary | ||||
| -------------------------------------------------------" > grid.configure.summary | ||||
|  | ||||
| GRID_SUMMARY="`cat grid.configure.summary`" | ||||
| AM_SUBST_NOTMAKE([GRID_SUMMARY]) | ||||
| AC_SUBST([GRID_SUMMARY]) | ||||
|  | ||||
| AC_CONFIG_FILES([grid-config], [chmod +x grid-config]) | ||||
| AC_CONFIG_FILES(Makefile) | ||||
| AC_CONFIG_FILES(lib/Makefile) | ||||
| AC_CONFIG_FILES(tests/Makefile) | ||||
| AC_CONFIG_FILES(tests/IO/Makefile) | ||||
| AC_CONFIG_FILES(tests/core/Makefile) | ||||
| AC_CONFIG_FILES(tests/debug/Makefile) | ||||
| AC_CONFIG_FILES(tests/forces/Makefile) | ||||
| AC_CONFIG_FILES(tests/hadrons/Makefile) | ||||
| AC_CONFIG_FILES(tests/hmc/Makefile) | ||||
| AC_CONFIG_FILES(tests/solver/Makefile) | ||||
| AC_CONFIG_FILES(tests/smearing/Makefile) | ||||
| AC_CONFIG_FILES(tests/qdpxx/Makefile) | ||||
| AC_CONFIG_FILES(tests/testu01/Makefile) | ||||
| AC_CONFIG_FILES(benchmarks/Makefile) | ||||
| AC_CONFIG_FILES(extras/Makefile) | ||||
| AC_CONFIG_FILES(extras/Hadrons/Makefile) | ||||
| AC_OUTPUT | ||||
|  | ||||
| echo "" | ||||
| cat config.summary | ||||
| cat grid.configure.summary | ||||
| echo "" | ||||
|  | ||||
|   | ||||
| @@ -162,7 +162,8 @@ void Application::saveParameterFile(const std::string parameterFileName) | ||||
| sizeString((size)*locVol_) << " (" << sizeString(size)  << "/site)" | ||||
|  | ||||
| #define DEFINE_MEMPEAK \ | ||||
| auto memPeak = [this](const std::vector<unsigned int> &program)\ | ||||
| GeneticScheduler<unsigned int>::ObjFunc memPeak = \ | ||||
| [this](const std::vector<unsigned int> &program)\ | ||||
| {\ | ||||
|     unsigned int memPeak;\ | ||||
|     bool         msg;\ | ||||
|   | ||||
| @@ -145,6 +145,15 @@ std::string typeName(void) | ||||
|     return typeName(typeIdPt<T>()); | ||||
| } | ||||
|  | ||||
| // default writers/readers | ||||
| #ifdef HAVE_HDF5 | ||||
| typedef Hdf5Reader CorrReader; | ||||
| typedef Hdf5Writer CorrWriter; | ||||
| #else | ||||
| typedef XmlReader CorrReader; | ||||
| typedef XmlWriter CorrWriter; | ||||
| #endif | ||||
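|  | ||||
| // Usage sketch (pattern taken from the contraction modules below): the | ||||
| // same call site compiles against either backend, e.g. | ||||
| //   CorrWriter writer(par().output); | ||||
| //   write(writer, "meson", result); | ||||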
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_Global_hpp_ | ||||
|   | ||||
| @@ -29,12 +29,20 @@ See the full license in the file "LICENSE" in the top level distribution directo | ||||
| #include <Grid/Hadrons/Modules/MAction/DWF.hpp> | ||||
| #include <Grid/Hadrons/Modules/MAction/Wilson.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/Meson.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp> | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp> | ||||
| #include <Grid/Hadrons/Modules/MGauge/Load.hpp> | ||||
| #include <Grid/Hadrons/Modules/MGauge/Random.hpp> | ||||
| #include <Grid/Hadrons/Modules/MGauge/Unit.hpp> | ||||
| #include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp> | ||||
| #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp> | ||||
| #include <Grid/Hadrons/Modules/MSource/Point.hpp> | ||||
| #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp> | ||||
| #include <Grid/Hadrons/Modules/MSource/Wall.hpp> | ||||
| #include <Grid/Hadrons/Modules/MSource/Z2.hpp> | ||||
| #include <Grid/Hadrons/Modules/Quark.hpp> | ||||
|   | ||||
| @@ -112,7 +112,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void) | ||||
|                  << " quarks '" << par().q1 << "', '" << par().q2 << "', and '" | ||||
|                  << par().q3 << "'" << std::endl; | ||||
|      | ||||
|     XmlWriter             writer(par().output); | ||||
|     CorrWriter             writer(par().output); | ||||
|     PropagatorField1      &q1 = *env().template getObject<PropagatorField1>(par().q1); | ||||
|     PropagatorField2      &q2 = *env().template getObject<PropagatorField2>(par().q2); | ||||
|     PropagatorField3      &q3 = *env().template getObject<PropagatorField3>(par().q3); | ||||
|   | ||||
extras/Hadrons/Modules/MContraction/DiscLoop.hpp (new file, 144 lines)
							| @@ -0,0 +1,144 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_DiscLoop_hpp_ | ||||
| #define Hadrons_DiscLoop_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Global.hpp> | ||||
| #include <Grid/Hadrons/Module.hpp> | ||||
| #include <Grid/Hadrons/ModuleFactory.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                                DiscLoop                                    * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| class DiscLoopPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(DiscLoopPar, | ||||
|                                     std::string,    q_loop, | ||||
|                                     Gamma::Algebra, gamma, | ||||
|                                     std::string,    output); | ||||
| }; | ||||
|  | ||||
| template <typename FImpl> | ||||
| class TDiscLoop: public Module<DiscLoopPar> | ||||
| { | ||||
|     TYPE_ALIASES(FImpl,); | ||||
|     class Result: Serializable | ||||
|     { | ||||
|     public: | ||||
|         GRID_SERIALIZABLE_CLASS_MEMBERS(Result, | ||||
|                                         Gamma::Algebra, gamma, | ||||
|                                         std::vector<Complex>, corr); | ||||
|     }; | ||||
| public: | ||||
|     // constructor | ||||
|     TDiscLoop(const std::string name); | ||||
|     // destructor | ||||
|     virtual ~TDiscLoop(void) = default; | ||||
|     // dependency relation | ||||
|     virtual std::vector<std::string> getInput(void); | ||||
|     virtual std::vector<std::string> getOutput(void); | ||||
|     // setup | ||||
|     virtual void setup(void); | ||||
|     // execution | ||||
|     virtual void execute(void); | ||||
| }; | ||||
|  | ||||
| MODULE_REGISTER_NS(DiscLoop, TDiscLoop<FIMPL>, MContraction); | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                       TDiscLoop implementation                             * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| TDiscLoop<FImpl>::TDiscLoop(const std::string name) | ||||
| : Module<DiscLoopPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TDiscLoop<FImpl>::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q_loop}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TDiscLoop<FImpl>::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TDiscLoop<FImpl>::setup(void) | ||||
| { | ||||
|      | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TDiscLoop<FImpl>::execute(void) | ||||
| { | ||||
|     LOG(Message) << "Computing disconnected loop contraction '" << getName()  | ||||
|                  << "' using '" << par().q_loop << "' with " << par().gamma  | ||||
|                  << " insertion." << std::endl; | ||||
|  | ||||
|     CorrWriter            writer(par().output); | ||||
|     PropagatorField       &q_loop = *env().template getObject<PropagatorField>(par().q_loop); | ||||
|     LatticeComplex        c(env().getGrid()); | ||||
|     Gamma                 gamma(par().gamma); | ||||
|     std::vector<TComplex> buf; | ||||
|     Result                result; | ||||
|  | ||||
|     c = trace(gamma*q_loop); | ||||
|     sliceSum(c, buf, Tp); | ||||
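|     // sliceSum sums the complex density over each timeslice (Tp = time | ||||
|     // direction), giving one value per t for the disconnected correlator. | ||||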
|  | ||||
|     result.gamma = par().gamma; | ||||
|     result.corr.resize(buf.size()); | ||||
|     for (unsigned int t = 0; t < buf.size(); ++t) | ||||
|     { | ||||
|         result.corr[t] = TensorRemove(buf[t]); | ||||
|     } | ||||
|  | ||||
|     write(writer, "disc", result); | ||||
| } | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_DiscLoop_hpp_ | ||||
extras/Hadrons/Modules/MContraction/Gamma3pt.hpp (new file, 170 lines)
							| @@ -0,0 +1,170 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_Gamma3pt_hpp_ | ||||
| #define Hadrons_Gamma3pt_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Global.hpp> | ||||
| #include <Grid/Hadrons/Module.hpp> | ||||
| #include <Grid/Hadrons/ModuleFactory.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /* | ||||
|  * 3pt contraction with gamma matrix insertion. | ||||
|  * | ||||
|  * Schematic: | ||||
|  * | ||||
|  *             q2           q3 | ||||
|  *        /----<------*------<----¬ | ||||
|  *       /          gamma          \ | ||||
|  *      /                           \ | ||||
|  *   i *                            * f | ||||
|  *      \                          / | ||||
|  *       \                        / | ||||
|  *        \----------->----------/ | ||||
|  *                   q1 | ||||
|  * | ||||
|  *      trace(g5*q1*adj(q2)*g5*gamma*q3) | ||||
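|  * | ||||
|  * (The g5 ... g5 sandwich around adj(q2) uses gamma_5-hermiticity to | ||||
|  *  form the backward-going q2 propagator.) | ||||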
|  */ | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                               Gamma3pt                                     * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| class Gamma3ptPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar, | ||||
|                                     std::string,    q1, | ||||
|                                     std::string,    q2, | ||||
|                                     std::string,    q3, | ||||
|                                     Gamma::Algebra, gamma, | ||||
|                                     std::string,    output); | ||||
| }; | ||||
|  | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| class TGamma3pt: public Module<Gamma3ptPar> | ||||
| { | ||||
|     TYPE_ALIASES(FImpl1, 1); | ||||
|     TYPE_ALIASES(FImpl2, 2); | ||||
|     TYPE_ALIASES(FImpl3, 3); | ||||
|     class Result: Serializable | ||||
|     { | ||||
|     public: | ||||
|         GRID_SERIALIZABLE_CLASS_MEMBERS(Result, | ||||
|                                         Gamma::Algebra, gamma, | ||||
|                                         std::vector<Complex>, corr); | ||||
|     }; | ||||
| public: | ||||
|     // constructor | ||||
|     TGamma3pt(const std::string name); | ||||
|     // destructor | ||||
|     virtual ~TGamma3pt(void) = default; | ||||
|     // dependency relation | ||||
|     virtual std::vector<std::string> getInput(void); | ||||
|     virtual std::vector<std::string> getOutput(void); | ||||
|     // setup | ||||
|     virtual void setup(void); | ||||
|     // execution | ||||
|     virtual void execute(void); | ||||
| }; | ||||
|  | ||||
| MODULE_REGISTER_NS(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction); | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                       TGamma3pt implementation                             * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| TGamma3pt<FImpl1, FImpl2, FImpl3>::TGamma3pt(const std::string name) | ||||
| : Module<Gamma3ptPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q1, par().q2, par().q3}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void) | ||||
| { | ||||
|      | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||
| void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void) | ||||
| { | ||||
|     LOG(Message) << "Computing 3pt contractions '" << getName() << "' using" | ||||
|                  << " quarks '" << par().q1 << "', '" << par().q2 << "' and '" | ||||
|                  << par().q3 << "', with " << par().gamma << " insertion."  | ||||
|                  << std::endl; | ||||
|  | ||||
|     CorrWriter            writer(par().output); | ||||
|     PropagatorField1      &q1 = *env().template getObject<PropagatorField1>(par().q1); | ||||
|     PropagatorField2      &q2 = *env().template getObject<PropagatorField2>(par().q2); | ||||
|     PropagatorField3      &q3 = *env().template getObject<PropagatorField3>(par().q3); | ||||
|     LatticeComplex        c(env().getGrid()); | ||||
|     Gamma                 g5(Gamma::Algebra::Gamma5); | ||||
|     Gamma                 gamma(par().gamma); | ||||
|     std::vector<TComplex> buf; | ||||
|     Result                result; | ||||
|  | ||||
|     c = trace(g5*q1*adj(q2)*(g5*gamma)*q3); | ||||
|     sliceSum(c, buf, Tp); | ||||
|  | ||||
|     result.gamma = par().gamma; | ||||
|     result.corr.resize(buf.size()); | ||||
|     for (unsigned int t = 0; t < buf.size(); ++t) | ||||
|     { | ||||
|         result.corr[t] = TensorRemove(buf[t]); | ||||
|     } | ||||
|  | ||||
|     write(writer, "gamma3pt", result); | ||||
| } | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_Gamma3pt_hpp_ | ||||
| @@ -6,8 +6,10 @@ Source file: extras/Hadrons/Modules/MContraction/Meson.hpp | ||||
|  | ||||
| Copyright (C) 2015 | ||||
| Copyright (C) 2016 | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Antonin Portelli <antonin.portelli@me.com> | ||||
|         Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| @@ -36,20 +38,39 @@ See the full license in the file "LICENSE" in the top level distribution directory | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /* | ||||
|   | ||||
|  Meson contractions | ||||
|  ----------------------------- | ||||
|   | ||||
|  * options: | ||||
|  - q1: input propagator 1 (string) | ||||
|  - q2: input propagator 2 (string) | ||||
|  - gammas: gamma products to insert at sink & source, pairs of gamma matrices  | ||||
|            (space-separated strings) in angled brackets (i.e. <g_sink g_src>), | ||||
|            in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>"). | ||||
|  | ||||
|            Special values: "all" - perform all possible contractions. | ||||
|  - mom: momentum insertion, space-separated float sequence (e.g. ".1 .2 1. 0."), | ||||
|         given as multiples of (2*pi) / L. | ||||
| */ | ||||
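For illustration, a minimal sketch of how the gammas option resolves into pairs (the input string is an example; this assumes strToVec<GammaPair> parses the angle-bracket form exactly as described above):

    // Example only: two sink/source pairs from an explicit option string.
    std::vector<GammaPair> list = strToVec<GammaPair>("<Gamma5 Gamma5><Gamma5 GammaT>");
    // list[0] = (Gamma5, Gamma5), list[1] = (Gamma5, GammaT).
    // Passing "all" instead takes the enumeration branch of parseGammaString below.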
|  | ||||
| /****************************************************************************** | ||||
|  *                                TMeson                                       * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaPair; | ||||
|  | ||||
| class MesonPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(MesonPar, | ||||
| -                                    std::string,    q1, | ||||
| -                                    std::string,    q2, | ||||
| -                                    std::string,    output, | ||||
| -                                    Gamma::Algebra, gammaSource, | ||||
| -                                    Gamma::Algebra, gammaSink); | ||||
| +                                    std::string, q1, | ||||
| +                                    std::string, q2, | ||||
| +                                    std::string, gammas, | ||||
| +                                    std::string, mom, | ||||
| +                                    std::string, output); | ||||
| }; | ||||
|  | ||||
| template <typename FImpl1, typename FImpl2> | ||||
| @@ -61,7 +82,10 @@ public: | ||||
|     class Result: Serializable | ||||
|     { | ||||
|     public: | ||||
| -        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, std::vector<Complex>, corr); | ||||
| +        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, | ||||
| +                                        Gamma::Algebra, gamma_snk, | ||||
| +                                        Gamma::Algebra, gamma_src, | ||||
| +                                        std::vector<Complex>, corr); | ||||
|     }; | ||||
| public: | ||||
|     // constructor | ||||
| @@ -71,6 +95,7 @@ public: | ||||
|     // dependencies/products | ||||
|     virtual std::vector<std::string> getInput(void); | ||||
|     virtual std::vector<std::string> getOutput(void); | ||||
|     virtual void parseGammaString(std::vector<GammaPair> &gammaList); | ||||
|     // execution | ||||
|     virtual void execute(void); | ||||
| }; | ||||
| @@ -103,6 +128,32 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void) | ||||
|     return output; | ||||
| } | ||||
|  | ||||
| template <typename FImpl1, typename FImpl2> | ||||
| void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList) | ||||
| { | ||||
|     // Determine gamma matrices to insert at source/sink. | ||||
|     if (par().gammas.compare("all") == 0) | ||||
|     { | ||||
|         // Do all contractions. | ||||
|         unsigned int n_gam = Ns * Ns; | ||||
|         gammaList.reserve(n_gam*n_gam); // reserve, not resize: pairs are appended below | ||||
|         for (unsigned int i = 1; i < Gamma::nGamma; i += 2) | ||||
|         { | ||||
|             for (unsigned int j = 1; j < Gamma::nGamma; j += 2) | ||||
|             { | ||||
|                 gammaList.push_back(std::make_pair((Gamma::Algebra)i,  | ||||
|                                                    (Gamma::Algebra)j)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     else | ||||
|     { | ||||
|         // Parse individual contractions from input string. | ||||
|         gammaList = strToVec<GammaPair>(par().gammas); | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl1, typename FImpl2> | ||||
| void TMeson<FImpl1, FImpl2>::execute(void) | ||||
| @@ -111,21 +162,44 @@ void TMeson<FImpl1, FImpl2>::execute(void) | ||||
|                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'" | ||||
|                  << std::endl; | ||||
|      | ||||
| -    XmlWriter             writer(par().output); | ||||
| -    PropagatorField1      &q1 = *env().template getObject<PropagatorField1>(par().q1); | ||||
| -    PropagatorField2      &q2 = *env().template getObject<PropagatorField2>(par().q2); | ||||
| -    LatticeComplex        c(env().getGrid()); | ||||
| -    Gamma                 gSrc(par().gammaSource), gSnk(par().gammaSink); | ||||
| -    Gamma                 g5(Gamma::Algebra::Gamma5); | ||||
| -    std::vector<TComplex> buf; | ||||
| -    Result                result; | ||||
| +    CorrWriter              writer(par().output); | ||||
| +    PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1); | ||||
| +    PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2); | ||||
| +    LatticeComplex         c(env().getGrid()); | ||||
| +    Gamma                  g5(Gamma::Algebra::Gamma5); | ||||
| +    std::vector<GammaPair> gammaList; | ||||
| +    std::vector<TComplex>  buf; | ||||
| +    std::vector<Result>    result; | ||||
| +    std::vector<Real>      p; | ||||
|  | ||||
| -    c = trace(gSnk*q1*adj(gSrc)*g5*adj(q2)*g5); | ||||
| -    sliceSum(c, buf, Tp); | ||||
| -    result.corr.resize(buf.size()); | ||||
| -    for (unsigned int t = 0; t < buf.size(); ++t) | ||||
| +    p  = strToVec<Real>(par().mom); | ||||
| +    LatticeComplex         ph(env().getGrid()), coor(env().getGrid()); | ||||
| +    Complex                i(0.0,1.0); | ||||
| +    ph = zero; | ||||
| +    for(unsigned int mu = 0; mu < env().getNd(); mu++) | ||||
|     { | ||||
| -        result.corr[t] = TensorRemove(buf[t]); | ||||
| +        LatticeCoordinate(coor, mu); | ||||
| +        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu]))); | ||||
|     } | ||||
| +    ph = exp((Real)(2*M_PI)*i*ph); | ||||
| + | ||||
| +    parseGammaString(gammaList); | ||||
| + | ||||
| +    result.resize(gammaList.size()); | ||||
| +    for (unsigned int i = 0; i < result.size(); ++i) | ||||
| +    { | ||||
| +        Gamma gSnk(gammaList[i].first); | ||||
| +        Gamma gSrc(gammaList[i].second); | ||||
| +        c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph; | ||||
| +        sliceSum(c, buf, Tp); | ||||
| + | ||||
| +        result[i].gamma_snk = gammaList[i].first; | ||||
| +        result[i].gamma_src = gammaList[i].second; | ||||
| +        result[i].corr.resize(buf.size()); | ||||
| +        for (unsigned int t = 0; t < buf.size(); ++t) | ||||
| +        { | ||||
| +            result[i].corr[t] = TensorRemove(buf[t]); | ||||
| +        } | ||||
| +    } | ||||
|     write(writer, "meson", result); | ||||
| } | ||||
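Written out, the loop above computes, for each pair (Gamma_snk, Gamma_src) in gammaList (a sketch; p_mu is supplied in units of 2*pi/L_mu as stated in the options comment):

    C(t, \vec{p}) = \sum_{\vec{x}} e^{\,2\pi i \sum_\mu p_\mu x_\mu / L_\mu}\;
                    \mathrm{tr}\left[ (\gamma_5 \Gamma_{\rm snk})\, q_1(x)\, (\Gamma_{\rm src}^\dagger \gamma_5)\, q_2^\dagger(x) \right]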
|   | ||||
extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp (new file, 114 lines)
							| @@ -0,0 +1,114 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_WeakHamiltonian_hpp_ | ||||
| #define Hadrons_WeakHamiltonian_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Global.hpp> | ||||
| #include <Grid/Hadrons/Module.hpp> | ||||
| #include <Grid/Hadrons/ModuleFactory.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                         WeakHamiltonian                                    * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| /******************************************************************************* | ||||
|  * Utilities for contractions involving the Weak Hamiltonian. | ||||
|  ******************************************************************************/ | ||||
| //// Sum and store correlator. | ||||
| #define MAKE_DIAG(exp, buf, res, n)\ | ||||
| sliceSum(exp, buf, Tp);\ | ||||
| res.name = (n);\ | ||||
| res.corr.resize(buf.size());\ | ||||
| for (unsigned int t = 0; t < buf.size(); ++t)\ | ||||
| {\ | ||||
|     res.corr[t] = TensorRemove(buf[t]);\ | ||||
| } | ||||
|  | ||||
| //// Contraction of mu index: use 'mu' variable in exp. | ||||
| #define SUM_MU(buf,exp)\ | ||||
| buf = zero;\ | ||||
| for (unsigned int mu = 0; mu < ndim; ++mu)\ | ||||
| {\ | ||||
|     buf += exp;\ | ||||
| } | ||||
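As a reading aid, here is what one pairing of these macros expands to, using the names from WeakHamiltonianEye.cc further down (a sketch, not extra functionality):

    // SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu])) expands to:
    expbuf = zero;
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        expbuf += trace(S_body[mu]*S_loop[mu]);
    }
    // MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S") then slice-sums expbuf
    // along Tp and copies each timeslice into result[S_diag].corr.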
|  | ||||
| enum  | ||||
| { | ||||
|   i_V = 0, | ||||
|   i_A = 1, | ||||
|   n_i = 2 | ||||
| }; | ||||
|  | ||||
| class WeakHamiltonianPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar, | ||||
|                                     std::string, q1, | ||||
|                                     std::string, q2, | ||||
|                                     std::string, q3, | ||||
|                                     std::string, q4, | ||||
|                                     std::string, output); | ||||
| }; | ||||
|  | ||||
| #define MAKE_WEAK_MODULE(modname)\ | ||||
| class T##modname: public Module<WeakHamiltonianPar>\ | ||||
| {\ | ||||
| public:\ | ||||
|     TYPE_ALIASES(FIMPL,)\ | ||||
|     class Result: Serializable\ | ||||
|     {\ | ||||
|     public:\ | ||||
|         GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\ | ||||
|                                         std::string, name,\ | ||||
|                                         std::vector<Complex>, corr);\ | ||||
|     };\ | ||||
| public:\ | ||||
|     /* constructor */ \ | ||||
|     T##modname(const std::string name);\ | ||||
|     /* destructor */ \ | ||||
|     virtual ~T##modname(void) = default;\ | ||||
|     /* dependency relation */ \ | ||||
|     virtual std::vector<std::string> getInput(void);\ | ||||
|     virtual std::vector<std::string> getOutput(void);\ | ||||
|     /* setup */ \ | ||||
|     virtual void setup(void);\ | ||||
|     /* execution */ \ | ||||
|     virtual void execute(void);\ | ||||
|     std::vector<std::string> VA_label = {"V", "A"};\ | ||||
| };\ | ||||
| MODULE_REGISTER_NS(modname, T##modname, MContraction); | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_WeakHamiltonian_hpp_ | ||||
extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc (new file, 137 lines)
							| @@ -0,0 +1,137 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp> | ||||
|  | ||||
| using namespace Grid; | ||||
| using namespace Hadrons; | ||||
| using namespace MContraction; | ||||
|  | ||||
| /* | ||||
|  * Weak Hamiltonian current-current contractions, Eye-type. | ||||
|  *  | ||||
|  * These contractions are generated by the Q1 and Q2 operators in the physical | ||||
|  * basis (see e.g. Fig 3 of arXiv:1507.03094). | ||||
|  *  | ||||
|  * Schematics:        q4                 |                   | ||||
|  *                  /-<-¬                |                              | ||||
|  *                 /     \               |             q2           q3 | ||||
|  *                 \     /               |        /----<------*------<----¬                         | ||||
|  *            q2    \   /    q3          |       /          /-*-¬          \ | ||||
|  *       /-----<-----* *-----<----¬      |      /          /     \          \ | ||||
|  *    i *            H_W           * f   |   i *           \     /  q4      * f | ||||
|  *       \                        /      |      \           \->-/          /    | ||||
|  *        \                      /       |       \                        /        | ||||
|  *         \---------->---------/        |        \----------->----------/         | ||||
|  *                   q1                  |                   q1                   | ||||
|  *                                       | | ||||
|  *                Saucer (S)             |                  Eye (E) | ||||
|  *  | ||||
|  * S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2]) | ||||
|  * E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2]) | ||||
|  */ | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                  TWeakHamiltonianEye implementation                        * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name) | ||||
| : Module<WeakHamiltonianPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| std::vector<std::string> TWeakHamiltonianEye::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| std::vector<std::string> TWeakHamiltonianEye::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| void TWeakHamiltonianEye::setup(void) | ||||
| { | ||||
|  | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| void TWeakHamiltonianEye::execute(void) | ||||
| { | ||||
|     LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '"  | ||||
|                  << getName() << "' using quarks '" << par().q1 << "', '"  | ||||
|                  << par().q2 << "', '" << par().q3 << "' and '" << par().q4  | ||||
|                  << "'." << std::endl; | ||||
|  | ||||
|     CorrWriter             writer(par().output); | ||||
|     PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1); | ||||
|     PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2); | ||||
|     PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3); | ||||
|     PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4); | ||||
|     Gamma g5            = Gamma(Gamma::Algebra::Gamma5); | ||||
|     LatticeComplex        expbuf(env().getGrid()); | ||||
|     std::vector<TComplex> corrbuf; | ||||
|     std::vector<Result>   result(n_eye_diag); | ||||
|     unsigned int ndim   = env().getNd(); | ||||
|  | ||||
|     PropagatorField              tmp1(env().getGrid()); | ||||
|     LatticeComplex               tmp2(env().getGrid()); | ||||
|     std::vector<PropagatorField> S_body(ndim, tmp1); | ||||
|     std::vector<PropagatorField> S_loop(ndim, tmp1); | ||||
|     std::vector<LatticeComplex>  E_body(ndim, tmp2); | ||||
|     std::vector<LatticeComplex>  E_loop(ndim, tmp2); | ||||
|  | ||||
|     // Setup for S-type contractions. | ||||
|     for (unsigned int mu = 0; mu < ndim; ++mu) | ||||
|     { | ||||
|         S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu])); | ||||
|         S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu])); | ||||
|     } | ||||
|  | ||||
|     // Perform S-type contractions.     | ||||
|     SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu])) | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S") | ||||
|  | ||||
|     // Recycle sub-expressions for E-type contractions. | ||||
|     for (unsigned int mu = 0; mu < ndim; ++mu) | ||||
|     { | ||||
|         E_body[mu] = trace(S_body[mu]); | ||||
|         E_loop[mu] = trace(S_loop[mu]); | ||||
|     } | ||||
|  | ||||
|     // Perform E-type contractions. | ||||
|     SUM_MU(expbuf, E_body[mu]*E_loop[mu]) | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E") | ||||
|  | ||||
|     write(writer, "HW_Eye", result); | ||||
| } | ||||
extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp (new file, 58 lines)
							| @@ -0,0 +1,58 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_WeakHamiltonianEye_hpp_ | ||||
| #define Hadrons_WeakHamiltonianEye_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                         WeakHamiltonianEye                                 * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| enum | ||||
| { | ||||
|     S_diag = 0, | ||||
|     E_diag = 1, | ||||
|     n_eye_diag = 2 | ||||
| }; | ||||
|  | ||||
| // Saucer and Eye subdiagram contractions. | ||||
| #define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) (Q_3*g5*Q_1*adj(Q_2)*g5*gamma) | ||||
| #define MAKE_SE_LOOP(Q_loop, gamma) (Q_loop*gamma) | ||||
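Expanded for a single Lorentz component (g5 and the propagators are declared in WeakHamiltonianEye.cc), the sub-diagrams read:

    // S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu]))
    //            = (q3*g5*q1*adj(q2)*g5*GammaL(Gamma::gmu[mu]));
    // S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]))
    //            = (q4*GammaL(Gamma::gmu[mu]));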
|  | ||||
| MAKE_WEAK_MODULE(WeakHamiltonianEye) | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_WeakHamiltonianEye_hpp_ | ||||
extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc (new file, 139 lines)
							| @@ -0,0 +1,139 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp> | ||||
|  | ||||
| using namespace Grid; | ||||
| using namespace Hadrons; | ||||
| using namespace MContraction; | ||||
|  | ||||
| /* | ||||
|  * Weak Hamiltonian current-current contractions, Non-Eye-type. | ||||
|  *  | ||||
|  * These contractions are generated by the Q1 and Q2 operators in the physical | ||||
|  * basis (see e.g. Fig 3 of arXiv:1507.03094). | ||||
|  *  | ||||
|  * Schematic:      | ||||
|  *            q2             q3          |           q2              q3 | ||||
|  *          /--<--¬       /--<--¬        |        /--<--¬         /--<--¬        | ||||
|  *         /       \     /       \       |       /       \       /       \       | ||||
|  *        /         \   /         \      |      /         \     /         \      | ||||
|  *       /           \ /           \     |     /           \   /           \     | ||||
|  *    i *             * H_W         *  f |  i *             * * H_W         * f  | ||||
|  *      \             *             |    |     \           /   \           / | ||||
|  *       \           / \           /     |      \         /     \         /     | ||||
|  *        \         /   \         /      |       \       /       \       /   | ||||
|  *         \       /     \       /       |        \-->--/         \-->--/       | ||||
|  *          \-->--/       \-->--/        |          q1               q4  | ||||
|  *            q1             q4          | | ||||
|  *                Connected (C)          |                 Wing (W) | ||||
|  * | ||||
|  * C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu]) | ||||
|  * W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu]) | ||||
|  *  | ||||
|  */ | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                  TWeakHamiltonianNonEye implementation                     * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name) | ||||
| : Module<WeakHamiltonianPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| std::vector<std::string> TWeakHamiltonianNonEye::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| void TWeakHamiltonianNonEye::setup(void) | ||||
| { | ||||
|  | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| void TWeakHamiltonianNonEye::execute(void) | ||||
| { | ||||
|     LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"  | ||||
|                  << getName() << "' using quarks '" << par().q1 << "', '"  | ||||
|                  << par().q2 << "', '" << par().q3 << "' and '" << par().q4  | ||||
|                  << "'." << std::endl; | ||||
|      | ||||
|     CorrWriter             writer(par().output); | ||||
|     PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1); | ||||
|     PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2); | ||||
|     PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3); | ||||
|     PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4); | ||||
|     Gamma g5            = Gamma(Gamma::Algebra::Gamma5); | ||||
|     LatticeComplex        expbuf(env().getGrid()); | ||||
|     std::vector<TComplex> corrbuf; | ||||
|     std::vector<Result>   result(n_noneye_diag);  | ||||
|     unsigned int ndim   = env().getNd(); | ||||
|  | ||||
|     PropagatorField              tmp1(env().getGrid()); | ||||
|     LatticeComplex               tmp2(env().getGrid()); | ||||
|     std::vector<PropagatorField> C_i_side_loop(ndim, tmp1); | ||||
|     std::vector<PropagatorField> C_f_side_loop(ndim, tmp1); | ||||
|     std::vector<LatticeComplex>  W_i_side_loop(ndim, tmp2); | ||||
|     std::vector<LatticeComplex>  W_f_side_loop(ndim, tmp2); | ||||
|  | ||||
|     // Setup for C-type contractions. | ||||
|     for (unsigned int mu = 0; mu < ndim; ++mu) | ||||
|     { | ||||
|         C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu])); | ||||
|         C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu])); | ||||
|     } | ||||
|  | ||||
|     // Perform C-type contractions.     | ||||
|     SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu])) | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C") | ||||
|  | ||||
|     // Recycle sub-expressions for W-type contractions. | ||||
|     for (unsigned int mu = 0; mu < ndim; ++mu) | ||||
|     { | ||||
|         W_i_side_loop[mu] = trace(C_i_side_loop[mu]); | ||||
|         W_f_side_loop[mu] = trace(C_f_side_loop[mu]); | ||||
|     } | ||||
|  | ||||
|     // Perform W-type contractions. | ||||
|     SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu]) | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W") | ||||
|  | ||||
|     write(writer, "HW_NonEye", result); | ||||
| } | ||||
extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp (new file, 57 lines)
| @@ -0,0 +1,57 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_WeakHamiltonianNonEye_hpp_ | ||||
| #define Hadrons_WeakHamiltonianNonEye_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                         WeakHamiltonianNonEye                              * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| enum | ||||
| { | ||||
|     W_diag = 0, | ||||
|     C_diag = 1, | ||||
|     n_noneye_diag = 2 | ||||
| }; | ||||
|  | ||||
| // Wing and Connected subdiagram contractions | ||||
| #define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma) | ||||
|  | ||||
| MAKE_WEAK_MODULE(WeakHamiltonianNonEye) | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_WeakHamiltonianNonEye_hpp_ | ||||
extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc (new file, 135 lines)
							| @@ -0,0 +1,135 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp> | ||||
|  | ||||
| using namespace Grid; | ||||
| using namespace Hadrons; | ||||
| using namespace MContraction; | ||||
|  | ||||
| /* | ||||
|  * Weak Hamiltonian + current contractions, disconnected topology for neutral  | ||||
|  * mesons. | ||||
|  *  | ||||
|  * These contractions are generated by operators Q_1,...,Q_10 of the dS=1 Weak | ||||
|  * Hamiltonian in the physical basis and an additional current J (see e.g.  | ||||
|  * Fig 11 of arXiv:1507.03094). | ||||
|  *  | ||||
|  * Schematic: | ||||
|  *                         | ||||
|  *           q2          q4             q3 | ||||
|  *       /--<--¬     /---<--¬       /---<--¬ | ||||
|  *     /         \ /         \     /        \ | ||||
|  *  i *           * H_W      |  J *          * f | ||||
|  *     \         / \         /     \        / | ||||
|  *      \--->---/   \-------/       \------/ | ||||
|  *          q1  | ||||
|  *  | ||||
|  * options | ||||
|  * - q1: input propagator 1 (string) | ||||
|  * - q2: input propagator 2 (string) | ||||
|  * - q3: input propagator 3 (string), assumed to be sequential propagator  | ||||
|  * - q4: input propagator 4 (string), assumed to be a loop | ||||
|  *  | ||||
|  * type 1: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5) | ||||
|  * type 2: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5) | ||||
|  */ | ||||
|  | ||||
| /******************************************************************************* | ||||
|  *                  TWeakNeutral4ptDisc implementation                         * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name) | ||||
| : Module<WeakHamiltonianPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| std::vector<std::string> TWeakNeutral4ptDisc::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| void TWeakNeutral4ptDisc::setup(void) | ||||
| { | ||||
|  | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| void TWeakNeutral4ptDisc::execute(void) | ||||
| { | ||||
|     LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"  | ||||
|                  << getName() << "' using quarks '" << par().q1 << "', '"  | ||||
|                  << par().q2 << "', '" << par().q3 << "' and '" << par().q4  | ||||
|                  << "'." << std::endl; | ||||
|  | ||||
|     CorrWriter             writer(par().output); | ||||
|     PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1); | ||||
|     PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2); | ||||
|     PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3); | ||||
|     PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4); | ||||
|     Gamma g5            = Gamma(Gamma::Algebra::Gamma5); | ||||
|     LatticeComplex        expbuf(env().getGrid()); | ||||
|     std::vector<TComplex> corrbuf; | ||||
|     std::vector<Result>   result(n_neut_disc_diag); | ||||
|     unsigned int ndim   = env().getNd(); | ||||
|  | ||||
|     PropagatorField              tmp(env().getGrid()); | ||||
|     std::vector<PropagatorField> meson(ndim, tmp); | ||||
|     std::vector<PropagatorField> loop(ndim, tmp); | ||||
|     LatticeComplex               curr(env().getGrid()); | ||||
|  | ||||
|     // Setup for type 1 contractions. | ||||
|     for (unsigned int mu = 0; mu < ndim; ++mu) | ||||
|     { | ||||
|         meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu])); | ||||
|         loop[mu] = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu])); | ||||
|     } | ||||
|     curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5)); | ||||
|  | ||||
|     // Perform type 1 contractions.     | ||||
|     SUM_MU(expbuf, trace(meson[mu]*loop[mu])) | ||||
|     expbuf *= curr; | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1") | ||||
|  | ||||
|     // Perform type 2 contractions. | ||||
|     SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu])) | ||||
|     expbuf *= curr; | ||||
|     MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2") | ||||
|  | ||||
|     write(writer, "HW_disc0", result); | ||||
| } | ||||
extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp (new file, 59 lines)
							| @@ -0,0 +1,59 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson    <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_WeakNeutral4ptDisc_hpp_ | ||||
| #define Hadrons_WeakNeutral4ptDisc_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                         WeakNeutral4ptDisc                                 * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MContraction) | ||||
|  | ||||
| enum | ||||
| { | ||||
|     neut_disc_1_diag = 0, | ||||
|     neut_disc_2_diag = 1, | ||||
|     n_neut_disc_diag = 2 | ||||
| }; | ||||
|  | ||||
| // Neutral 4pt disconnected subdiagram contractions. | ||||
| #define MAKE_DISC_MESON(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma) | ||||
| #define MAKE_DISC_LOOP(Q_LOOP, gamma) (Q_LOOP*gamma) | ||||
| #define MAKE_DISC_CURR(Q_c, gamma) (trace(Q_c*gamma)) | ||||
|  | ||||
| MAKE_WEAK_MODULE(WeakNeutral4ptDisc) | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_WeakNeutral4ptDisc_hpp_ | ||||
| @@ -65,7 +65,7 @@ void TLoad::setup(void) | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| void TLoad::execute(void) | ||||
| { | ||||
| -    NerscField  header; | ||||
| +    FieldMetaData  header; | ||||
|     std::string fileName = par().file + "." | ||||
|                            + std::to_string(env().getTrajectory()); | ||||
|      | ||||
| @@ -74,5 +74,5 @@ void TLoad::execute(void) | ||||
|     LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName()); | ||||
|     NerscIO::readConfiguration(U, header, fileName); | ||||
|     LOG(Message) << "NERSC header:" << std::endl; | ||||
| -    dump_nersc_header(header, LOG(Message)); | ||||
| +    dump_meta_data(header, LOG(Message)); | ||||
| } | ||||
|   | ||||
extras/Hadrons/Modules/MLoop/NoiseLoop.hpp (new file, 132 lines)
							| @@ -0,0 +1,132 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp | ||||
|  | ||||
| Copyright (C) 2016 | ||||
|  | ||||
| Author: Andrew Lawson <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_NoiseLoop_hpp_ | ||||
| #define Hadrons_NoiseLoop_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Global.hpp> | ||||
| #include <Grid/Hadrons/Module.hpp> | ||||
| #include <Grid/Hadrons/ModuleFactory.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /* | ||||
|   | ||||
|  Noise loop propagator | ||||
|  ----------------------------- | ||||
|  * loop_x = q_x * adj(eta_x) | ||||
|   | ||||
|  * options: | ||||
|  - q = Result of inversion on noise source. | ||||
|  - eta = noise source. | ||||
|  | ||||
|  */ | ||||
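The estimator works because q is the inversion M^{-1} eta on the noise source; assuming the noise is normalised so that <eta_y eta_x^dag> averages to delta_{xy} (the usual convention, not enforced by this module), the noise average gives the all-to-all loop diagonal:

    \langle \mathrm{loop}_x \rangle_\eta
        = \sum_y M^{-1}_{xy}\, \langle \eta_y\, \eta_x^\dagger \rangle_\eta
        = M^{-1}_{xx}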
|  | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                         NoiseLoop                                          * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MLoop) | ||||
|  | ||||
| class NoiseLoopPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(NoiseLoopPar, | ||||
|                                     std::string, q, | ||||
|                                     std::string, eta); | ||||
| }; | ||||
|  | ||||
| template <typename FImpl> | ||||
| class TNoiseLoop: public Module<NoiseLoopPar> | ||||
| { | ||||
| public: | ||||
|     TYPE_ALIASES(FImpl,); | ||||
| public: | ||||
|     // constructor | ||||
|     TNoiseLoop(const std::string name); | ||||
|     // destructor | ||||
|     virtual ~TNoiseLoop(void) = default; | ||||
|     // dependency relation | ||||
|     virtual std::vector<std::string> getInput(void); | ||||
|     virtual std::vector<std::string> getOutput(void); | ||||
|     // setup | ||||
|     virtual void setup(void); | ||||
|     // execution | ||||
|     virtual void execute(void); | ||||
| }; | ||||
|  | ||||
| MODULE_REGISTER_NS(NoiseLoop, TNoiseLoop<FIMPL>, MLoop); | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                 TNoiseLoop implementation                                  * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| TNoiseLoop<FImpl>::TNoiseLoop(const std::string name) | ||||
| : Module<NoiseLoopPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TNoiseLoop<FImpl>::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in = {par().q, par().eta}; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TNoiseLoop<FImpl>::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TNoiseLoop<FImpl>::setup(void) | ||||
| { | ||||
|     env().template registerLattice<PropagatorField>(getName()); | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TNoiseLoop<FImpl>::execute(void) | ||||
| { | ||||
|     PropagatorField &loop = *env().template createLattice<PropagatorField>(getName()); | ||||
|     PropagatorField &q    = *env().template getObject<PropagatorField>(par().q); | ||||
|     PropagatorField &eta  = *env().template getObject<PropagatorField>(par().eta); | ||||
|     loop = q*adj(eta); | ||||
| } | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_NoiseLoop_hpp_ | ||||
| @@ -6,6 +6,7 @@ Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp | ||||
|  | ||||
| Copyright (C) 2015 | ||||
| Copyright (C) 2016 | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Antonin Portelli <antonin.portelli@me.com> | ||||
|  | ||||
| @@ -149,9 +150,9 @@ void TSeqGamma<FImpl>::execute(void) | ||||
|     for(unsigned int mu = 0; mu < env().getNd(); mu++) | ||||
|     { | ||||
|         LatticeCoordinate(coor, mu); | ||||
| -        ph = ph + p[mu]*coor; | ||||
| +        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu]))); | ||||
|     } | ||||
| -    ph = exp(i*ph); | ||||
| +    ph = exp((Real)(2*M_PI)*i*ph); | ||||
|     LatticeCoordinate(t, Tp); | ||||
|     src = where((t >= par().tA) and (t <= par().tB), ph*(g*q), 0.*q); | ||||
| } | ||||
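The two replaced lines combine into the phase (a sketch; p_mu is now read as a multiple of 2*pi/L_mu, matching the Meson and Wall modules in this patch):

    \mathrm{ph}(x) = \exp\!\left( 2\pi i \sum_\mu \frac{p_\mu x_\mu}{L_\mu} \right)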
|   | ||||
extras/Hadrons/Modules/MSource/Wall.hpp (new file, 147 lines)
							| @@ -0,0 +1,147 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: extras/Hadrons/Modules/MSource/Wall.hpp | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Andrew Lawson <andrew.lawson1991@gmail.com> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef Hadrons_WallSource_hpp_ | ||||
| #define Hadrons_WallSource_hpp_ | ||||
|  | ||||
| #include <Grid/Hadrons/Global.hpp> | ||||
| #include <Grid/Hadrons/Module.hpp> | ||||
| #include <Grid/Hadrons/ModuleFactory.hpp> | ||||
|  | ||||
| BEGIN_HADRONS_NAMESPACE | ||||
|  | ||||
| /* | ||||
|   | ||||
|  Wall source | ||||
|  ----------------------------- | ||||
|  * src_x = delta(x_3 - tW) * exp(i x.mom) | ||||
|   | ||||
|  * options: | ||||
|  - tW: source timeslice (integer) | ||||
|  - mom: momentum insertion, space-separated float sequence (e.g. ".1 .2 1. 0."), | ||||
|        given as multiples of (2*pi) / L. | ||||
|   | ||||
|  */ | ||||
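Equivalently, with t the Tp coordinate and p_mu read as multiples of 2*pi/L_mu (a sketch matching the where(...) construction in execute below):

    \mathrm{src}(\vec{x}, t) = \delta_{t,\,t_W}\, \exp\!\left( 2\pi i \sum_\mu \frac{p_\mu x_\mu}{L_\mu} \right)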
|  | ||||
| /****************************************************************************** | ||||
|  *                         Wall                                               * | ||||
|  ******************************************************************************/ | ||||
| BEGIN_MODULE_NAMESPACE(MSource) | ||||
|  | ||||
| class WallPar: Serializable | ||||
| { | ||||
| public: | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(WallPar, | ||||
|                                     unsigned int, tW, | ||||
|                                     std::string, mom); | ||||
| }; | ||||
|  | ||||
| template <typename FImpl> | ||||
| class TWall: public Module<WallPar> | ||||
| { | ||||
| public: | ||||
|     TYPE_ALIASES(FImpl,); | ||||
| public: | ||||
|     // constructor | ||||
|     TWall(const std::string name); | ||||
|     // destructor | ||||
|     virtual ~TWall(void) = default; | ||||
|     // dependency relation | ||||
|     virtual std::vector<std::string> getInput(void); | ||||
|     virtual std::vector<std::string> getOutput(void); | ||||
|     // setup | ||||
|     virtual void setup(void); | ||||
|     // execution | ||||
|     virtual void execute(void); | ||||
| }; | ||||
|  | ||||
| MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource); | ||||
|  | ||||
| /****************************************************************************** | ||||
|  *                 TWall implementation                                       * | ||||
|  ******************************************************************************/ | ||||
| // constructor ///////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| TWall<FImpl>::TWall(const std::string name) | ||||
| : Module<WallPar>(name) | ||||
| {} | ||||
|  | ||||
| // dependencies/products /////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TWall<FImpl>::getInput(void) | ||||
| { | ||||
|     std::vector<std::string> in; | ||||
|      | ||||
|     return in; | ||||
| } | ||||
|  | ||||
| template <typename FImpl> | ||||
| std::vector<std::string> TWall<FImpl>::getOutput(void) | ||||
| { | ||||
|     std::vector<std::string> out = {getName()}; | ||||
|      | ||||
|     return out; | ||||
| } | ||||
|  | ||||
| // setup /////////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TWall<FImpl>::setup(void) | ||||
| { | ||||
|     env().template registerLattice<PropagatorField>(getName()); | ||||
| } | ||||
|  | ||||
| // execution /////////////////////////////////////////////////////////////////// | ||||
| template <typename FImpl> | ||||
| void TWall<FImpl>::execute(void) | ||||
| {     | ||||
|     LOG(Message) << "Generating wall source at t = " << par().tW  | ||||
|                  << " with momentum " << par().mom << std::endl; | ||||
|      | ||||
|     PropagatorField &src = *env().template createLattice<PropagatorField>(getName()); | ||||
|     Lattice<iScalar<vInteger>> t(env().getGrid()); | ||||
|     LatticeComplex             ph(env().getGrid()), coor(env().getGrid()); | ||||
|     std::vector<Real>          p; | ||||
|     Complex                    i(0.0,1.0); | ||||
|      | ||||
|     p  = strToVec<Real>(par().mom); | ||||
|     ph = zero; | ||||
|     for(unsigned int mu = 0; mu < Nd; mu++) | ||||
|     { | ||||
|         LatticeCoordinate(coor, mu); | ||||
|         ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu]))); | ||||
|     } | ||||
|     ph = exp((Real)(2*M_PI)*i*ph); | ||||
|     LatticeCoordinate(t, Tp); | ||||
|     src = 1.; | ||||
|     src = where((t == par().tW), src*ph, 0.*src); | ||||
| } | ||||
|  | ||||
| END_MODULE_NAMESPACE | ||||
|  | ||||
| END_HADRONS_NAMESPACE | ||||
|  | ||||
| #endif // Hadrons_WallSource_hpp_ | ||||
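The same momentum convention applies to the wall source just defined: execute() builds, in LaTeX,

    \mathrm{src}(x) = \delta_{x_t,\, t_W}\,
    \exp\!\Big( i\, 2\pi \sum_{\mu} \frac{p_\mu\, x_\mu}{L_\mu} \Big),

i.e. unit spin-colour structure on the single timeslice tW multiplied by a plane-wave phase, with the mom components again counting units of 2*pi/L_mu.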
| @@ -173,7 +173,7 @@ void TQuark<FImpl>::execute(void) | ||||
|                 *env().template getObject<PropagatorField>(getName()); | ||||
|              | ||||
|             axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0); | ||||
|             axpby_ssp_pplus(sol, 0., sol, 1., sol, 0, Ls_-1); | ||||
|             axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1); | ||||
|             ExtractSlice(tmp, sol, 0, 0); | ||||
|             FermToProp(p4d, tmp, s, c); | ||||
|         } | ||||
|   | ||||
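Context for the one-line Quark.hpp fix above (the standard domain-wall surface construction; this reading is inferred from the code, not stated in the commit): the physical 4d propagator is assembled from the chiral projections of the two boundary slices of the 5d solution,

    q(x) = P_-\,\psi(x, 0) + P_+\,\psi(x, L_s - 1),
    \qquad P_\pm = \tfrac{1}{2}(1 \pm \gamma_5).

The first axpby_ssp_pminus call stores the P_- piece on slice 0; with the old scale factor 0. the following axpby_ssp_pplus call overwrote it rather than accumulating, losing the P_- contribution. Changing the scale to 1. makes the P_+ piece add to, instead of replace, what is already on slice 0.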
| @@ -1,4 +1,7 @@ | ||||
| modules_cc =\ | ||||
|   Modules/MContraction/WeakHamiltonianEye.cc \ | ||||
|   Modules/MContraction/WeakHamiltonianNonEye.cc \ | ||||
|   Modules/MContraction/WeakNeutral4ptDisc.cc \ | ||||
|   Modules/MGauge/Load.cc \ | ||||
|   Modules/MGauge/Random.cc \ | ||||
|   Modules/MGauge/Unit.cc | ||||
| @@ -7,13 +10,21 @@ modules_hpp =\ | ||||
|   Modules/MAction/DWF.hpp \ | ||||
|   Modules/MAction/Wilson.hpp \ | ||||
|   Modules/MContraction/Baryon.hpp \ | ||||
|   Modules/MContraction/DiscLoop.hpp \ | ||||
|   Modules/MContraction/Gamma3pt.hpp \ | ||||
|   Modules/MContraction/Meson.hpp \ | ||||
|   Modules/MContraction/WeakHamiltonian.hpp \ | ||||
|   Modules/MContraction/WeakHamiltonianEye.hpp \ | ||||
|   Modules/MContraction/WeakHamiltonianNonEye.hpp \ | ||||
|   Modules/MContraction/WeakNeutral4ptDisc.hpp \ | ||||
|   Modules/MGauge/Load.hpp \ | ||||
|   Modules/MGauge/Random.hpp \ | ||||
|   Modules/MGauge/Unit.hpp \ | ||||
|   Modules/MLoop/NoiseLoop.hpp \ | ||||
|   Modules/MSolver/RBPrecCG.hpp \ | ||||
|   Modules/MSource/Point.hpp \ | ||||
|   Modules/MSource/SeqGamma.hpp \ | ||||
|   Modules/MSource/Wall.hpp \ | ||||
|   Modules/MSource/Z2.hpp \ | ||||
|   Modules/Quark.hpp | ||||
|  | ||||
|   | ||||
| @@ -21,3 +21,16 @@ problem. The test case works with icpc and with clang++, but fails consistently | ||||
| current variants. | ||||
|  | ||||
| Peter | ||||
|  | ||||
|  | ||||
| ************ | ||||
|  | ||||
| Second GCC bug reported, see Issue 100. | ||||
|  | ||||
| https://wandbox.org/permlink/tzssJza6R9XnqANw | ||||
| https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80652 | ||||
|  | ||||
| Travis is failing under gcc-5 for Test_simd now that I have added more comprehensive testing to the | ||||
| CI test suite. This exposes the limits of Travis's run-time caps and weak cores. | ||||
|  | ||||
| Travis uses 5.4.1 for g++-5. | ||||
|   | ||||
grid-config.in (86 additions, new executable file)
| @@ -0,0 +1,86 @@ | ||||
| #! /bin/sh | ||||
|  | ||||
| prefix=@prefix@ | ||||
| exec_prefix=@exec_prefix@ | ||||
| includedir=@includedir@ | ||||
|  | ||||
| usage() | ||||
| { | ||||
|   cat <<EOF | ||||
| Usage: grid-config [OPTION] | ||||
|  | ||||
| Known values for OPTION are: | ||||
|  | ||||
|   --prefix     show Grid installation prefix | ||||
|   --cxxflags   print pre-processor and compiler flags | ||||
|   --ldflags    print library linking flags | ||||
|   --libs       print library linking information | ||||
|   --summary    print full build summary | ||||
|   --help       display this help and exit | ||||
|   --version    output version information | ||||
|   --git        print git revision | ||||
|  | ||||
| EOF | ||||
|    | ||||
|   exit $1 | ||||
| } | ||||
|  | ||||
| if test $# -eq 0; then | ||||
|   usage 1 | ||||
| fi | ||||
|  | ||||
| cflags=false | ||||
| libs=false | ||||
|  | ||||
| while test $# -gt 0; do | ||||
|   case "$1" in | ||||
|     -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;; | ||||
|     *) optarg= ;; | ||||
|   esac | ||||
|    | ||||
|   case "$1" in | ||||
|     --prefix) | ||||
|       echo $prefix | ||||
|     ;; | ||||
|      | ||||
|     --version) | ||||
|       echo @VERSION@ | ||||
|       exit 0 | ||||
|     ;; | ||||
|      | ||||
|     --git) | ||||
|       echo "@GRID_BRANCH@ @GRID_SHA@" | ||||
|       exit 0 | ||||
|     ;; | ||||
|      | ||||
|     --help) | ||||
|       usage 0 | ||||
|     ;; | ||||
|      | ||||
|     --cxxflags) | ||||
|       echo @GRID_CXXFLAGS@ | ||||
|     ;; | ||||
|      | ||||
|     --ldflags) | ||||
|       echo @GRID_LDFLAGS@ | ||||
|     ;; | ||||
|      | ||||
|     --libs) | ||||
|       echo @GRID_LIBS@ | ||||
|     ;; | ||||
|      | ||||
|     --summary) | ||||
|       echo "" | ||||
|       echo "@GRID_SUMMARY@" | ||||
|       echo "" | ||||
|     ;; | ||||
|      | ||||
|     *) | ||||
|       usage | ||||
|       exit 1 | ||||
|     ;; | ||||
|   esac | ||||
|   shift | ||||
| done | ||||
|  | ||||
| exit 0 | ||||
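Typical usage of the script above when compiling a program against an installed Grid (myprog.cc is a placeholder name, not from the source):

    c++ $(grid-config --cxxflags) myprog.cc $(grid-config --ldflags) $(grid-config --libs) -o myprog

Every flag string is substituted at configure time from the @GRID_...@ placeholders, so the output matches whatever the library was actually built with.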
lib/DisableWarnings.h (37 additions, new file)
| @@ -0,0 +1,37 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/DisableWarnings.h | ||||
|  | ||||
| Copyright (C) 2016 | ||||
|  | ||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef DISABLE_WARNINGS_H | ||||
| #define DISABLE_WARNINGS_H | ||||
|  | ||||
|  // disables an Intel compiler specific warning (in json.hpp) | ||||
| #pragma warning disable 488   | ||||
|  | ||||
|  | ||||
| #endif | ||||
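Note that #pragma warning is Intel/MSVC-style syntax; other compilers may emit an unknown-pragma warning of their own when they meet this line. A possible guard, sketched here as a suggestion rather than taken from the source (__INTEL_COMPILER is the standard ICC identification macro, not something the file uses):

    #ifdef __INTEL_COMPILER        // only the Intel compiler understands this pragma
    #pragma warning disable 488    // ICC warning 488, triggered inside json.hpp
    #endif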
lib/Grid.h (53 changes)
| @@ -38,52 +38,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #ifndef GRID_H | ||||
| #define GRID_H | ||||
|  | ||||
| /////////////////// | ||||
| // Std C++ dependencies | ||||
| /////////////////// | ||||
| #include <cassert> | ||||
| #include <complex> | ||||
| #include <vector> | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
| #include <random> | ||||
| #include <functional> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| #include <stdio.h> | ||||
| #include <signal.h> | ||||
| #include <ctime> | ||||
| #include <sys/time.h> | ||||
| #include <chrono> | ||||
|  | ||||
| /////////////////// | ||||
| // Grid headers | ||||
| /////////////////// | ||||
| #include "Config.h" | ||||
| #include <Grid/Timer.h> | ||||
| #include <Grid/PerfCount.h> | ||||
| #include <Grid/Log.h> | ||||
| #include <Grid/AlignedAllocator.h> | ||||
| #include <Grid/Simd.h> | ||||
| #include <Grid/serialisation/Serialisation.h> | ||||
| #include <Grid/Threads.h> | ||||
| #include <Grid/Lexicographic.h> | ||||
| #include <Grid/Init.h> | ||||
| #include <Grid/Communicator.h>  | ||||
| #include <Grid/Cartesian.h>     | ||||
| #include <Grid/Tensors.h>       | ||||
| #include <Grid/Lattice.h>       | ||||
| #include <Grid/Cshift.h>        | ||||
| #include <Grid/Stencil.h>       | ||||
| #include <Grid/Algorithms.h>    | ||||
| #include <Grid/parallelIO/BinaryIO.h> | ||||
| #include <Grid/FFT.h> | ||||
|  | ||||
| #include <Grid/qcd/QCD.h> | ||||
| #include <Grid/parallelIO/NerscIO.h> | ||||
| #include <Grid/qcd/hmc/NerscCheckpointer.h> | ||||
| #include <Grid/qcd/hmc/HmcRunner.h> | ||||
|  | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
| #include <Grid/GridQCDcore.h> | ||||
| #include <Grid/qcd/action/Action.h> | ||||
| #include <Grid/qcd/smearing/Smearing.h> | ||||
| #include <Grid/parallelIO/MetaData.h> | ||||
| #include <Grid/qcd/hmc/HMC_aggregate.h> | ||||
|  | ||||
| #endif | ||||
|   | ||||
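After this reorganisation, user code continues to pull in everything through the single umbrella header. A minimal sketch of a Grid main program (assuming the usual Grid_init/Grid_finalize entry points declared through these headers):

    #include <Grid/Grid.h>

    int main(int argc, char **argv)
    {
      Grid::Grid_init(&argc, &argv);   // parses --grid/--mpi style options, sets up communications
      // ... build grids and fields, run physics ...
      Grid::Grid_finalize();           // tears down communications cleanly
      return 0;
    }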
| @@ -2,11 +2,13 @@ | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/MatrixUtils.h | ||||
|     Source file: ./lib/Grid.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: azusayamaguchi <ayamaguc@YAMAKAZE.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
| @@ -25,51 +27,34 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_MATRIX_UTILS_H | ||||
| #define GRID_MATRIX_UTILS_H | ||||
| // | ||||
| //  Grid.h | ||||
| //  simd | ||||
| // | ||||
| //  Created by Peter Boyle on 09/05/2014. | ||||
| //  Copyright (c) 2014 University of Edinburgh. All rights reserved. | ||||
| // | ||||
|  | ||||
| namespace Grid { | ||||
| #ifndef GRID_BASE_H | ||||
| #define GRID_BASE_H | ||||
|  | ||||
|   namespace MatrixUtils {  | ||||
| #include <Grid/GridStd.h> | ||||
|  | ||||
|     template<class T> inline void Size(Matrix<T>& A,int &N,int &M){ | ||||
|       N=A.size(); assert(N>0); | ||||
|       M=A[0].size(); | ||||
|       for(int i=0;i<N;i++){ | ||||
| 	assert(A[i].size()==M); | ||||
|       } | ||||
|     } | ||||
| #include <Grid/perfmon/Timer.h> | ||||
| #include <Grid/perfmon/PerfCount.h> | ||||
| #include <Grid/log/Log.h> | ||||
| #include <Grid/allocator/AlignedAllocator.h> | ||||
| #include <Grid/simd/Simd.h> | ||||
| #include <Grid/serialisation/Serialisation.h> | ||||
| #include <Grid/threads/Threads.h> | ||||
| #include <Grid/util/Util.h> | ||||
| #include <Grid/communicator/Communicator.h>  | ||||
| #include <Grid/cartesian/Cartesian.h>     | ||||
| #include <Grid/tensors/Tensors.h>       | ||||
| #include <Grid/lattice/Lattice.h>       | ||||
| #include <Grid/cshift/Cshift.h>        | ||||
| #include <Grid/stencil/Stencil.h>       | ||||
| #include <Grid/parallelIO/BinaryIO.h> | ||||
| #include <Grid/algorithms/Algorithms.h>    | ||||
|  | ||||
|     template<class T> inline void SizeSquare(Matrix<T>& A,int &N) | ||||
|     { | ||||
|       int M; | ||||
|       Size(A,N,M); | ||||
|       assert(N==M); | ||||
|     } | ||||
|  | ||||
|     template<class T> inline void Fill(Matrix<T>& A,T & val) | ||||
|     {  | ||||
|       int N,M; | ||||
|       Size(A,N,M); | ||||
|       for(int i=0;i<N;i++){ | ||||
|       for(int j=0;j<M;j++){ | ||||
| 	A[i][j]=val; | ||||
|       }} | ||||
|     } | ||||
|     template<class T> inline void Diagonal(Matrix<T>& A,T & val) | ||||
|     {  | ||||
|       int N; | ||||
|       SizeSquare(A,N); | ||||
|       for(int i=0;i<N;i++){ | ||||
| 	A[i][i]=val; | ||||
|       } | ||||
|     } | ||||
|     template<class T> inline void Identity(Matrix<T>& A) | ||||
|     { | ||||
|       Fill(A,0.0); | ||||
|       Diagonal(A,1.0); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
| } | ||||
| #endif | ||||
| @@ -2,12 +2,12 @@ | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/qcd/hmc/HMC.cc | ||||
|     Source file: ./lib/Grid.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: neo <cossu@post.kek.jp> | ||||
| Author: azusayamaguchi <ayamaguc@YAMAKAZE.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
| @@ -27,10 +27,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #ifndef GRID_QCD_CORE_H | ||||
| #define GRID_QCD_CORE_H | ||||
|  | ||||
| namespace Grid{ | ||||
|   namespace QCD{ | ||||
| ///////////////////////// | ||||
| // Core Grid QCD headers | ||||
| ///////////////////////// | ||||
| #include <Grid/GridCore.h> | ||||
| #include <Grid/qcd/QCD.h> | ||||
| #include <Grid/qcd/spin/Spin.h> | ||||
| #include <Grid/qcd/utils/Utils.h> | ||||
| #include <Grid/qcd/representations/Representations.h> | ||||
|  | ||||
|   } | ||||
| } | ||||
| #endif | ||||
lib/GridStd.h (29 additions, new file)
| @@ -0,0 +1,29 @@ | ||||
| #ifndef GRID_STD_H | ||||
| #define GRID_STD_H | ||||
|  | ||||
| /////////////////// | ||||
| // Std C++ dependencies | ||||
| /////////////////// | ||||
| #include <cassert> | ||||
| #include <complex> | ||||
| #include <vector> | ||||
| #include <string> | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
| #include <random> | ||||
| #include <functional> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| #include <stdio.h> | ||||
| #include <signal.h> | ||||
| #include <ctime> | ||||
| #include <sys/time.h> | ||||
| #include <chrono> | ||||
| #include <zlib.h> | ||||
|  | ||||
| /////////////////// | ||||
| // Grid config | ||||
| /////////////////// | ||||
| #include "Config.h" | ||||
|  | ||||
| #endif /* GRID_STD_H */ | ||||
lib/Grid_Eigen_Dense.h (9 additions, new file)
| @@ -0,0 +1,9 @@ | ||||
| #pragma once | ||||
| #if defined __GNUC__ | ||||
| #pragma GCC diagnostic push | ||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | ||||
| #endif | ||||
| #include <Grid/Eigen/Dense> | ||||
| #if defined __GNUC__ | ||||
| #pragma GCC diagnostic pop | ||||
| #endif | ||||
Binary file not shown.
							| @@ -1,154 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/Old/Tensor_peek.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_MATH_PEEK_H | ||||
| #define GRID_MATH_PEEK_H | ||||
| namespace Grid { | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////// | ||||
| // Peek on a specific index; returns a scalar in that index, tensor inherits rest | ||||
| ////////////////////////////////////////////////////////////////////////////// | ||||
| // If we hit the right index, return scalar with no further recursion | ||||
|  | ||||
| //template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;} | ||||
| //template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;} | ||||
| //template<int Level> inline RealF peekIndex(const RealF arg) { return arg;} | ||||
| //template<int Level> inline RealD peekIndex(const RealD arg) { return arg;} | ||||
| #if 0 | ||||
| // Scalar peek, no indices | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iScalar<vtype> &arg) ->  iScalar<vtype>  | ||||
| { | ||||
|   return arg; | ||||
| } | ||||
| // Vector peek, one index | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iVector<vtype,N> &arg,int i) -> iScalar<vtype> // Index matches | ||||
| { | ||||
|   iScalar<vtype> ret;                              // return scalar | ||||
|   ret._internal = arg._internal[i]; | ||||
|   return ret; | ||||
| } | ||||
| // Matrix peek, two indices | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->  iScalar<vtype> | ||||
| { | ||||
|   iScalar<vtype> ret;                              // return scalar | ||||
|   ret._internal = arg._internal[i][j]; | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| ///////////// | ||||
| // No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue | ||||
| ///////////// | ||||
| // scalar | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iScalar<vtype> &arg) -> iScalar<decltype(peekIndex<Level>(arg._internal))> | ||||
| { | ||||
|   iScalar<decltype(peekIndex<Level>(arg._internal))> ret; | ||||
|   ret._internal= peekIndex<Level>(arg._internal); | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iScalar<vtype> &arg,int i) ->  iScalar<decltype(peekIndex<Level>(arg._internal,i))>  | ||||
| { | ||||
|   iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret; | ||||
|   ret._internal=peekIndex<Level>(arg._internal,i); | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iScalar<vtype> &arg,int i,int j) ->  iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> | ||||
| { | ||||
|   iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret; | ||||
|   ret._internal=peekIndex<Level>(arg._internal,i,j); | ||||
|   return ret; | ||||
| } | ||||
| // vector | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
| auto peekIndex(const iVector<vtype,N> &arg) ->   iVector<decltype(peekIndex<Level>(arg._internal[0])),N> | ||||
| { | ||||
|   iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     ret._internal[ii]=peekIndex<Level>(arg._internal[ii]); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iVector<vtype,N> &arg,int i) ->  iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> | ||||
| { | ||||
|   iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iVector<vtype,N> &arg,int i,int j) ->  iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N>  | ||||
| { | ||||
|   iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i,j); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| // matrix | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
| auto peekIndex(const iMatrix<vtype,N> &arg) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N>  | ||||
| { | ||||
|   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj]);// Could avoid this because peeking a scalar is dumb | ||||
|   }} | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iMatrix<vtype,N> &arg,int i) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> | ||||
| { | ||||
|   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i); | ||||
|   }} | ||||
|   return ret; | ||||
| } | ||||
| template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> | ||||
| { | ||||
|   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret; | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i,j); | ||||
|   }} | ||||
|   return ret; | ||||
| } | ||||
| #endif | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif | ||||
| @@ -1,127 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/Old/Tensor_poke.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_MATH_POKE_H | ||||
| #define GRID_MATH_POKE_H | ||||
| namespace Grid { | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////// | ||||
| // Poke a specific index;  | ||||
| ////////////////////////////////////////////////////////////////////////////// | ||||
| #if 0 | ||||
| // Scalar poke | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg) | ||||
| { | ||||
|   ret._internal = arg._internal; | ||||
| } | ||||
| // Vector poke, one index | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i) | ||||
| { | ||||
|   ret._internal[i] = arg._internal; | ||||
| } | ||||
| //Matrix poke, two indices | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j) | ||||
| { | ||||
|   ret._internal[i][j] = arg._internal; | ||||
| } | ||||
|  | ||||
| ///////////// | ||||
| // No match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue | ||||
| ///////////// | ||||
| // scalar | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
| void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))>  &arg) | ||||
| { | ||||
|   pokeIndex<Level>(ret._internal,arg._internal); | ||||
| } | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0))> &arg, int i) | ||||
| 		  | ||||
| { | ||||
|   pokeIndex<Level>(ret._internal,arg._internal,i); | ||||
| } | ||||
| template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> &arg,int i,int j) | ||||
| { | ||||
|   pokeIndex<Level>(ret._internal,arg._internal,i,j); | ||||
| } | ||||
|  | ||||
| // Vector | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iVector<vtype,N> &ret, iVector<decltype(peekIndex<Level>(ret._internal)),N>  &arg) | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     pokeIndex<Level>(ret._internal[ii],arg._internal[ii]); | ||||
|   } | ||||
| } | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i) | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i); | ||||
|   } | ||||
| } | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg,int i,int j) | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|     pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i,j); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Matrix | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal)),N> &arg)		  | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj]); | ||||
|   }} | ||||
| } | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i) | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i); | ||||
|   }} | ||||
| } | ||||
| template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline  | ||||
|   void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg, int i,int j) | ||||
| { | ||||
|   for(int ii=0;ii<N;ii++){ | ||||
|   for(int jj=0;jj<N;jj++){ | ||||
|     pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j); | ||||
|   }} | ||||
| } | ||||
| #endif | ||||
|  | ||||
| } | ||||
| #endif | ||||
| @@ -39,19 +39,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/algorithms/approx/MultiShiftFunction.h> | ||||
|  | ||||
| #include <Grid/algorithms/iterative/ConjugateGradient.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientShifted.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateResidual.h> | ||||
| #include <Grid/algorithms/iterative/NormalEquations.h> | ||||
| #include <Grid/algorithms/iterative/SchurRedBlack.h> | ||||
|  | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||
|  | ||||
| // Lanczos support | ||||
| #include <Grid/algorithms/iterative/MatrixUtils.h> | ||||
| //#include <Grid/algorithms/iterative/MatrixUtils.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> | ||||
|  | ||||
| #include <Grid/algorithms/CoarsenedMatrix.h> | ||||
| #include <Grid/algorithms/FFT.h> | ||||
|  | ||||
| // Eigen/lanczos | ||||
| // EigCg | ||||
| @@ -267,8 +267,7 @@ namespace Grid { | ||||
|       SimpleCompressor<siteVector> compressor; | ||||
|       Stencil.HaloExchange(in,compressor); | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int ss=0;ss<Grid()->oSites();ss++){ | ||||
|       parallel_for(int ss=0;ss<Grid()->oSites();ss++){ | ||||
|         siteVector res = zero; | ||||
| 	siteVector nbr; | ||||
| 	int ptype; | ||||
| @@ -380,8 +379,7 @@ PARALLEL_FOR_LOOP | ||||
| 	  Subspace.ProjectToSubspace(oProj,oblock); | ||||
| 	  //	  blockProject(iProj,iblock,Subspace.subspace); | ||||
| 	  //	  blockProject(oProj,oblock,Subspace.subspace); | ||||
| PARALLEL_FOR_LOOP | ||||
| 	  for(int ss=0;ss<Grid()->oSites();ss++){ | ||||
| 	  parallel_for(int ss=0;ss<Grid()->oSites();ss++){ | ||||
| 	    for(int j=0;j<nbasis;j++){ | ||||
| 	      if( disp!= 0 ) { | ||||
| 		A[p]._odata[ss](j,i) = oProj._odata[ss](j); | ||||
| @@ -427,7 +425,7 @@ PARALLEL_FOR_LOOP | ||||
| 	A[p]=zero; | ||||
|       } | ||||
|  | ||||
|       GridParallelRNG  RNG(Grid()); RNG.SeedRandomDevice(); | ||||
|       GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34})); | ||||
|       Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val); | ||||
|  | ||||
|       Complex one(1.0); | ||||
|   | ||||
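The RNG hunk above trades a nondeterministic device seed for a fixed integer seed, so the random test vectors used to build the coarse operator are reproducible from run to run. The pattern, restated as a minimal sketch using the same calls that appear in the hunk (grid stands for any GridBase pointer, val for a lattice field):

    GridParallelRNG RNG(grid);                                      // parallel RNG over the grid
    RNG.SeedFixedIntegers(std::vector<int>({55, 72, 19, 17, 34}));  // fixed seeds: deterministic across runs
    random(RNG, val);                                               // fills val identically every run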
| @@ -235,7 +235,7 @@ namespace Grid { | ||||
| 	Field tmp(in._grid); | ||||
|  | ||||
| 	_Mat.MeooeDag(in,tmp); | ||||
| 	_Mat.MooeeInvDag(tmp,out); | ||||
|         _Mat.MooeeInvDag(tmp,out); | ||||
| 	_Mat.MeooeDag(out,tmp); | ||||
|  | ||||
| 	_Mat.MooeeDag(in,out); | ||||
|   | ||||
| @@ -197,8 +197,9 @@ namespace Grid { | ||||
|     void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|  | ||||
|       GridBase *grid=in._grid; | ||||
| //std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl; | ||||
| //<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl; | ||||
|  | ||||
|       // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl; | ||||
|       //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl; | ||||
|  | ||||
|       int vol=grid->gSites(); | ||||
|  | ||||
|   | ||||
| @@ -25,7 +25,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
| double MultiShiftFunction::approx(double x) | ||||
|   | ||||
| @@ -16,7 +16,7 @@ | ||||
| #define INCLUDED_ALG_REMEZ_H | ||||
|  | ||||
| #include <stddef.h> | ||||
| #include <Config.h> | ||||
| #include <Grid/GridStd.h> | ||||
|  | ||||
| #ifdef HAVE_LIBGMP | ||||
| #include "bigfloat.h" | ||||
|   | ||||
lib/algorithms/iterative/BlockConjugateGradient.h (366 additions, new file)
| @@ -0,0 +1,366 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||
| #define GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||
|  | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Block conjugate gradient. Dimension zero should be the block direction | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template <class Field> | ||||
| class BlockConjugateGradient : public OperatorFunction<Field> { | ||||
|  public: | ||||
|  | ||||
|   typedef typename Field::scalar_type scomplex; | ||||
|  | ||||
|   const int blockDim = 0; | ||||
|  | ||||
|   int Nblock; | ||||
|   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||
|                            // Defaults true. | ||||
|   RealD Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|    | ||||
|   BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||
|     : Tolerance(tol), | ||||
|     MaxIterations(maxit), | ||||
|     ErrorOnNoConverge(err_on_no_conv){}; | ||||
|  | ||||
| void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)  | ||||
| { | ||||
|   int Orthog = 0; // First dimension is block dim | ||||
|   Nblock = Src._grid->_fdimensions[Orthog]; | ||||
|  | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   Psi.checkerboard = Src.checkerboard; | ||||
|   conformable(Psi, Src); | ||||
|  | ||||
|   Field P(Src); | ||||
|   Field AP(Src); | ||||
|   Field R(Src); | ||||
|    | ||||
|   Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|  | ||||
|   Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|  | ||||
|   // Initial residual computation & set up | ||||
|   std::vector<RealD> residuals(Nblock); | ||||
|   std::vector<RealD> ssq(Nblock); | ||||
|  | ||||
|   sliceNorm(ssq,Src,Orthog); | ||||
|   RealD sssum=0; | ||||
|   for(int b=0;b<Nblock;b++) sssum+=ssq[b]; | ||||
|  | ||||
|   sliceNorm(residuals,Src,Orthog); | ||||
|   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||
|  | ||||
|   sliceNorm(residuals,Psi,Orthog); | ||||
|   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||
|  | ||||
|   // Initial search dir is guess | ||||
|   Linop.HermOp(Psi, AP); | ||||
|    | ||||
|  | ||||
|   /************************************************************************ | ||||
|    * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980) | ||||
|    ************************************************************************ | ||||
|    * O'Leary : R = B - A X | ||||
|    * O'Leary : P = M R ; preconditioner M = 1 | ||||
|    * O'Leary : alpha = PAP^{-1} RMR | ||||
|    * O'Leary : beta  = RMR^{-1}_old RMR_new | ||||
|    * O'Leary : X=X+Palpha | ||||
|    * O'Leary : R_new=R_old-AP alpha | ||||
|    * O'Leary : P=MR_new+P beta | ||||
|    */ | ||||
|  | ||||
|   R = Src - AP;   | ||||
|   P = R; | ||||
|   sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||
|  | ||||
|   GridStopWatch sliceInnerTimer; | ||||
|   GridStopWatch sliceMaddTimer; | ||||
|   GridStopWatch MatrixTimer; | ||||
|   GridStopWatch SolverTimer; | ||||
|   SolverTimer.Start(); | ||||
|  | ||||
|   int k; | ||||
|   for (k = 1; k <= MaxIterations; k++) { | ||||
|  | ||||
|     RealD rrsum=0; | ||||
|     for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b)); | ||||
|  | ||||
|     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
| 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     Linop.HermOp(P, AP); | ||||
|     MatrixTimer.Stop(); | ||||
|  | ||||
|     // Alpha | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductMatrix(m_pAp,P,AP,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     m_pAp_inv = m_pAp.inverse(); | ||||
|     m_alpha   = m_pAp_inv * m_rr ; | ||||
|  | ||||
|     // Psi, R update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||
|     sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     // Beta | ||||
|     m_rr_inv = m_rr.inverse(); | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     m_beta = m_rr_inv *m_rr; | ||||
|  | ||||
|     // Search update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddMatrix(AP,m_beta,P,R,Orthog); | ||||
|     sliceMaddTimer.Stop(); | ||||
|     P= AP; | ||||
|  | ||||
|     /********************* | ||||
|      * convergence monitor | ||||
|      ********************* | ||||
|      */ | ||||
|     RealD max_resid=0; | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       RealD rr = real(m_rr(b,b))/ssq[b]; | ||||
|       if ( rr > max_resid ) max_resid = rr; | ||||
|     } | ||||
|      | ||||
|     if ( max_resid < Tolerance*Tolerance ) {  | ||||
|  | ||||
|       SolverTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl; | ||||
|       for(int b=0;b<Nblock;b++){ | ||||
| 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl; | ||||
|       } | ||||
|       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|  | ||||
|       Linop.HermOp(Psi, AP); | ||||
|       AP = AP-Src; | ||||
|       std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||
| 	     | ||||
|       IterationsToComplete = k; | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|   } | ||||
|   std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl; | ||||
|  | ||||
|   if (ErrorOnNoConverge) assert(0); | ||||
|   IterationsToComplete = k; | ||||
| } | ||||
| }; | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // multiRHS conjugate gradient. Dimension zero should be the block direction | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template <class Field> | ||||
| class MultiRHSConjugateGradient : public OperatorFunction<Field> { | ||||
|  public: | ||||
|  | ||||
|   typedef typename Field::scalar_type scomplex; | ||||
|  | ||||
|   const int blockDim = 0; | ||||
|  | ||||
|   int Nblock; | ||||
|   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||
|                            // Defaults true. | ||||
|   RealD Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|    | ||||
|    MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||
|     : Tolerance(tol), | ||||
|     MaxIterations(maxit), | ||||
|     ErrorOnNoConverge(err_on_no_conv){}; | ||||
|  | ||||
| void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)  | ||||
| { | ||||
|   int Orthog = 0; // First dimension is block dim | ||||
|   Nblock = Src._grid->_fdimensions[Orthog]; | ||||
|  | ||||
|   std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   Psi.checkerboard = Src.checkerboard; | ||||
|   conformable(Psi, Src); | ||||
|  | ||||
|   Field P(Src); | ||||
|   Field AP(Src); | ||||
|   Field R(Src); | ||||
|    | ||||
|   std::vector<ComplexD> v_pAp(Nblock); | ||||
|   std::vector<RealD> v_rr (Nblock); | ||||
|   std::vector<RealD> v_rr_inv(Nblock); | ||||
|   std::vector<RealD> v_alpha(Nblock); | ||||
|   std::vector<RealD> v_beta(Nblock); | ||||
|  | ||||
|   // Initial residual computation & set up | ||||
|   std::vector<RealD> residuals(Nblock); | ||||
|   std::vector<RealD> ssq(Nblock); | ||||
|  | ||||
|   sliceNorm(ssq,Src,Orthog); | ||||
|   RealD sssum=0; | ||||
|   for(int b=0;b<Nblock;b++) sssum+=ssq[b]; | ||||
|  | ||||
|   sliceNorm(residuals,Src,Orthog); | ||||
|   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||
|  | ||||
|   sliceNorm(residuals,Psi,Orthog); | ||||
|   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||
|  | ||||
|   // Initial search dir is guess | ||||
|   Linop.HermOp(Psi, AP); | ||||
|  | ||||
|   R = Src - AP;   | ||||
|   P = R; | ||||
|   sliceNorm(v_rr,R,Orthog); | ||||
|  | ||||
|   GridStopWatch sliceInnerTimer; | ||||
|   GridStopWatch sliceMaddTimer; | ||||
|   GridStopWatch sliceNormTimer; | ||||
|   GridStopWatch MatrixTimer; | ||||
|   GridStopWatch SolverTimer; | ||||
|  | ||||
|   SolverTimer.Start(); | ||||
|   int k; | ||||
|   for (k = 1; k <= MaxIterations; k++) { | ||||
|  | ||||
|     RealD rrsum=0; | ||||
|     for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]); | ||||
|  | ||||
|     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
| 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     Linop.HermOp(P, AP); | ||||
|     MatrixTimer.Stop(); | ||||
|  | ||||
|     // Alpha | ||||
|     //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductVector(v_pAp,P,AP,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl; | ||||
|       v_alpha[b] = v_rr[b]/real(v_pAp[b]); | ||||
|     } | ||||
|  | ||||
|     // Psi, R update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||
|     sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     // Beta | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       v_rr_inv[b] = 1.0/v_rr[b]; | ||||
|     } | ||||
|     sliceNormTimer.Start(); | ||||
|     sliceNorm(v_rr,R,Orthog); | ||||
|     sliceNormTimer.Stop(); | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       v_beta[b] = v_rr_inv[b] *v_rr[b]; | ||||
|     } | ||||
|  | ||||
|     // Search update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddVector(P,v_beta,P,R,Orthog); | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     /********************* | ||||
|      * convergence monitor | ||||
|      ********************* | ||||
|      */ | ||||
|     RealD max_resid=0; | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       RealD rr = v_rr[b]/ssq[b]; | ||||
|       if ( rr > max_resid ) max_resid = rr; | ||||
|     } | ||||
|      | ||||
|     if ( max_resid < Tolerance*Tolerance ) {  | ||||
|  | ||||
|       SolverTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl; | ||||
|       for(int b=0;b<Nblock;b++){ | ||||
| 	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl; | ||||
|       } | ||||
|       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|  | ||||
|       Linop.HermOp(Psi, AP); | ||||
|       AP = AP-Src; | ||||
|       std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||
|  | ||||
|  | ||||
|       IterationsToComplete = k; | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|   } | ||||
|   std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl; | ||||
|  | ||||
|   if (ErrorOnNoConverge) assert(0); | ||||
|   IterationsToComplete = k; | ||||
| } | ||||
| }; | ||||
|  | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif | ||||
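In LaTeX, the O'Leary recurrences implemented above (preconditioner M = 1, all block inner products taken slice-wise along dimension 0, with alpha and beta being Nblock x Nblock matrices):

    \alpha_k = (P_k^\dagger A P_k)^{-1}\,(R_k^\dagger R_k), \qquad
    X_{k+1} = X_k + P_k\,\alpha_k, \qquad
    R_{k+1} = R_k - A P_k\,\alpha_k,

    \beta_k = (R_k^\dagger R_k)^{-1}\,(R_{k+1}^\dagger R_{k+1}), \qquad
    P_{k+1} = R_{k+1} + P_k\,\beta_k.

MultiRHSConjugateGradient runs the same iteration with alpha and beta restricted to their diagonals, so each right-hand side follows an independent scalar CG while sharing the slice-wise reductions and matrix applications.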
| @@ -45,6 +45,8 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|                            // Defaults true. | ||||
|   RealD Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|    | ||||
|   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||
|       : Tolerance(tol), | ||||
|         MaxIterations(maxit), | ||||
| @@ -76,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|     cp = a; | ||||
|     ssq = norm2(src); | ||||
|  | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:   mmp " << b << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:  cp,r " << cp << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:     p " << a << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   mmp " << b << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:  cp,r " << cp << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:     p " << a << std::endl; | ||||
|  | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
| @@ -97,8 +93,7 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|     } | ||||
|  | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq | ||||
|               << std::endl; | ||||
|               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; | ||||
|  | ||||
|     GridStopWatch LinalgTimer; | ||||
|     GridStopWatch MatrixTimer; | ||||
| @@ -128,8 +123,11 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|       p = p * b + r; | ||||
|  | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
|       std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl; | ||||
|       std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl; | ||||
|  | ||||
|       // Stopping condition | ||||
|       if (cp <= rsq) { | ||||
| @@ -137,31 +135,33 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|         Linop.HermOpAndNorm(psi, mmp, d, qq); | ||||
|         p = mmp - src; | ||||
|  | ||||
|         RealD mmpnorm = sqrt(norm2(mmp)); | ||||
|         RealD psinorm = sqrt(norm2(psi)); | ||||
|         RealD srcnorm = sqrt(norm2(src)); | ||||
|         RealD resnorm = sqrt(norm2(p)); | ||||
|         RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
|         std::cout << GridLogMessage | ||||
|                   << "ConjugateGradient: Converged on iteration " << k << std::endl; | ||||
|         std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) | ||||
|                   << " true residual " << true_residual << " target " | ||||
|                   << Tolerance << std::endl; | ||||
|         std::cout << GridLogMessage << "Time elapsed: Iterations " | ||||
|                   << SolverTimer.Elapsed() << " Matrix  " | ||||
|                   << MatrixTimer.Elapsed() << " Linalg " | ||||
|                   << LinalgTimer.Elapsed(); | ||||
|         std::cout << std::endl; | ||||
|         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; | ||||
|         std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
|         std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
|  | ||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
| 	IterationsToComplete = k;	 | ||||
|  | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|     std::cout << GridLogMessage << "ConjugateGradient did NOT converge" | ||||
|               << std::endl; | ||||
|  | ||||
|     if (ErrorOnNoConverge) assert(0); | ||||
|     IterationsToComplete = k; | ||||
|  | ||||
|   } | ||||
| }; | ||||
| } | ||||
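|  | ||||
| // The hunk above also exposes IterationsToComplete, filled in whenever the | ||||
| // solver returns, converged or not. A minimal caller sketch, assuming a | ||||
| // Grid-style Hermitian operator HermOp and LatticeFermion fields src, psi | ||||
| // (illustrative names, not taken from the diff): | ||||
|  | ||||
| ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000); | ||||
| CG(HermOp, src, psi);                      // solve HermOp psi = src | ||||
| std::cout << "CG took " << CG.IterationsToComplete | ||||
|           << " iterations" << std::endl;   // valid once the call returns | ||||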
|   | ||||
| @@ -35,6 +35,7 @@ namespace Grid { | ||||
|   class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { | ||||
|   public:                                                 | ||||
|     RealD   Tolerance; | ||||
|     RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||
|     Integer MaxInnerIterations; | ||||
|     Integer MaxOuterIterations; | ||||
|     GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||
| @@ -42,12 +43,16 @@ namespace Grid { | ||||
|     LinearOperatorBase<FieldF> &Linop_f; | ||||
|     LinearOperatorBase<FieldD> &Linop_d; | ||||
|  | ||||
|     Integer TotalInnerIterations; //Number of inner CG iterations | ||||
|     Integer TotalOuterIterations; //Number of restarts | ||||
|     Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step | ||||
|  | ||||
|     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||
|     LinearFunction<FieldF> *guesser; | ||||
|      | ||||
|     MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) : | ||||
|       Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||
|       Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), | ||||
|       Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), | ||||
|       OuterLoopNormMult(100.), guesser(NULL){ }; | ||||
|  | ||||
|     void useGuesser(LinearFunction<FieldF> &g){ | ||||
| @@ -55,9 +60,8 @@ namespace Grid { | ||||
|     } | ||||
|    | ||||
|     void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
| 	(*this)(src_d_in,sol_d,NULL); | ||||
|     } | ||||
|     void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){ | ||||
|       TotalInnerIterations = 0; | ||||
| 	 | ||||
|       GridStopWatch TotalTimer; | ||||
|       TotalTimer.Start(); | ||||
|      | ||||
| @@ -77,7 +81,7 @@ namespace Grid { | ||||
|       FieldD src_d(DoublePrecGrid); | ||||
|       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||
|      | ||||
|       RealD inner_tol = Tolerance; | ||||
|       RealD inner_tol = InnerTolerance; | ||||
|      | ||||
|       FieldF src_f(SinglePrecGrid); | ||||
|       src_f.checkerboard = cb; | ||||
| @@ -85,17 +89,18 @@ namespace Grid { | ||||
|       FieldF sol_f(SinglePrecGrid); | ||||
|       sol_f.checkerboard = cb; | ||||
|      | ||||
|       ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|       ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|       CG_f.ErrorOnNoConverge = false; | ||||
|  | ||||
|       GridStopWatch InnerCGtimer; | ||||
|  | ||||
|       GridStopWatch PrecChangeTimer; | ||||
|      | ||||
|       for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
|       Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|        | ||||
|       for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
| 	//Compute double precision rsd and also new RHS vector. | ||||
| 	Linop_d.HermOp(sol_d, tmp_d); | ||||
| 	if(shift) axpy(tmp_d,*shift,sol_d,tmp_d); | ||||
| 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||
|        | ||||
| 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||
| @@ -119,8 +124,9 @@ namespace Grid { | ||||
| 	//Inner CG | ||||
| 	CG_f.Tolerance = inner_tol; | ||||
| 	InnerCGtimer.Start(); | ||||
| 	CG_f(Linop_f, src_f, sol_f,shift); | ||||
| 	CG_f(Linop_f, src_f, sol_f); | ||||
| 	InnerCGtimer.Stop(); | ||||
| 	TotalInnerIterations += CG_f.IterationsToComplete; | ||||
|        | ||||
| 	//Convert sol back to double and add to double prec solution | ||||
| 	PrecChangeTimer.Start(); | ||||
| @@ -133,11 +139,13 @@ namespace Grid { | ||||
|       //Final trial CG | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|       ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||
|       CG_d(Linop_d, src_d_in, sol_d,shift); | ||||
|       ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||
|       CG_d(Linop_d, src_d_in, sol_d); | ||||
|       TotalFinalStepIterations = CG_d.IterationsToComplete; | ||||
|  | ||||
|       TotalTimer.Stop(); | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|     } | ||||
|   }; | ||||
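|  | ||||
| // The class above is a restarted defect-correction solver: each outer step | ||||
| // forms the double-precision residual, solves the correction equation in | ||||
| // single precision to the (now adjustable) InnerTolerance, and accumulates | ||||
| // the correction. A stripped-down sketch of that loop, reusing the names from | ||||
| // the diff (iteration bookkeeping and the guesser path omitted): | ||||
|  | ||||
| for (Integer outer = 0; outer < MaxOuterIterations; outer++) { | ||||
|   Linop_d.HermOp(sol_d, tmp_d);                        // A x in double | ||||
|   RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); // r = b - A x | ||||
|   if (norm < OuterLoopNormMult * stop) break;          // hand over to final solve | ||||
|   precisionChange(src_f, src_d);                       // demote the residual | ||||
|   zeroit(sol_f);                                       // fresh inner guess | ||||
|   CG_f(Linop_f, src_f, sol_f);                         // single-precision solve | ||||
|   precisionChange(tmp_d, sol_f);                       // promote the correction | ||||
|   axpy(sol_d, 1., tmp_d, sol_d);                       // x += delta | ||||
| } | ||||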
|  | ||||
|   | ||||
| @@ -45,7 +45,6 @@ public: | ||||
|     Integer MaxIterations; | ||||
|     int verbose; | ||||
|     MultiShiftFunction shifts; | ||||
|     int iter; | ||||
|  | ||||
|     ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :  | ||||
| 	MaxIterations(maxit), | ||||
| @@ -61,7 +60,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) | ||||
|   std::vector<Field> results(nshift,grid); | ||||
|   (*this)(Linop,src,results,psi); | ||||
| } | ||||
|  | ||||
| void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi) | ||||
| { | ||||
|   int nshift = shifts.order; | ||||
| @@ -107,12 +105,11 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
|   RealD a,b,c,d; | ||||
|   RealD cp,bp,qq; //prev | ||||
|    | ||||
|   int cb=src.checkerboard; | ||||
|   // Matrix mult fields | ||||
|   Field r(grid); | ||||
|   Field p(grid); p.checkerboard = src.checkerboard; | ||||
|   Field p(grid); | ||||
|   Field tmp(grid); | ||||
|   Field mmp(grid);mmp.checkerboard = src.checkerboard; | ||||
|   Field mmp(grid); | ||||
|    | ||||
|   // Check lightest mass | ||||
|   for(int s=0;s<nshift;s++){ | ||||
| @@ -135,9 +132,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
|   p=src; | ||||
|    | ||||
|   //MdagM+m[0] | ||||
|   std::cout << "p.checkerboard " << p.checkerboard | ||||
|   << "mmp.checkerboard " << mmp.checkerboard << std::endl; | ||||
|  | ||||
|   Linop.HermOpAndNorm(p,mmp,d,qq); | ||||
|   axpy(mmp,mass[0],p,mmp); | ||||
|   RealD rn = norm2(p); | ||||
| @@ -275,7 +269,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
| 	RealD cn = norm2(src); | ||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||
|       } | ||||
|       iter = k; | ||||
|       return; | ||||
|     } | ||||
|   } | ||||
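|  | ||||
| // For orientation: the multi-shift solver above relies on the shift | ||||
| // invariance of Krylov spaces, K_k(A,b) = K_k(A+sigma,b), so one sequence of | ||||
| // operator applications serves every shifted system | ||||
| // | ||||
| //   \[ (A + \sigma_s)\, x_s = b, \qquad s = 0,\dots,n_{\rm shift}-1, \] | ||||
| // | ||||
| // with mass[s] playing the role of sigma_s (added via the axpy after each | ||||
| // HermOpAndNorm) and the smallest shift, mass[0], driving the primary | ||||
| // CG recurrence. | ||||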
|   | ||||
| @@ -1,404 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H | ||||
| #define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|   //Mixed precision restarted defect correction CG | ||||
|   template<class FieldD,class FieldF | ||||
| //, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0 | ||||
| //, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0 | ||||
| >  | ||||
|   class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> { | ||||
|   public:                                                 | ||||
| //    RealD   Tolerance; | ||||
|     Integer MaxInnerIterations; | ||||
|     Integer MaxOuterIterations; | ||||
|     GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||
|     RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||
|     LinearOperatorBase<FieldF> &Linop_f; | ||||
|     LinearOperatorBase<FieldD> &Linop_d; | ||||
|     MultiShiftFunction shifts; | ||||
|     Integer iter; | ||||
|  | ||||
|     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||
| //    LinearFunction<FieldF> *guesser; | ||||
|      | ||||
|     MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d,  | ||||
| Integer maxinnerit,	MultiShiftFunction &_shifts ) : | ||||
|       Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||
|       MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid), | ||||
|       OuterLoopNormMult(100.), shifts(_shifts) {}; | ||||
|  | ||||
|    | ||||
|     void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
| 	assert(0); // not yet implemented | ||||
|     } | ||||
|     void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){ | ||||
|       GridStopWatch TotalTimer; | ||||
|       TotalTimer.Start(); | ||||
|      | ||||
|       int cb = src_d_in.checkerboard; | ||||
|  | ||||
|       int nshift = shifts.order; | ||||
|       assert(nshift == sol_d.size()); | ||||
|       for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb; | ||||
|      | ||||
|       RealD src_norm = norm2(src_d_in); | ||||
| //      RealD stop = src_norm * Tolerance*Tolerance; | ||||
|  | ||||
|       GridBase* DoublePrecGrid = src_d_in._grid; | ||||
|       FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb; | ||||
|      | ||||
|       FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb; | ||||
|      | ||||
|       FieldD src_d(DoublePrecGrid); | ||||
|       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||
|      | ||||
| //      RealD inner_tol = Tolerance; | ||||
|   	FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb; | ||||
|      | ||||
|       FieldF src_f(SinglePrecGrid); | ||||
|       src_f.checkerboard = cb; | ||||
|      | ||||
|       std::vector<FieldF> sol_f(nshift,SinglePrecGrid); | ||||
|       for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb; | ||||
|      | ||||
| //      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|       ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts); | ||||
| //      CG_f.ErrorOnNoConverge = false; | ||||
|  | ||||
|       GridStopWatch InnerCGtimer; | ||||
|  | ||||
|       GridStopWatch PrecChangeTimer; | ||||
|      | ||||
| { | ||||
| //	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||
|  | ||||
| //	if(norm < OuterLoopNormMult * stop){ | ||||
| //	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | ||||
| //	  break; | ||||
| //	} | ||||
| //	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||
|  | ||||
| 	PrecChangeTimer.Start(); | ||||
| 	precisionChange(src_f, src_d); | ||||
| 	PrecChangeTimer.Stop(); | ||||
|        | ||||
| //	zeroit(sol_f); | ||||
|  | ||||
|  | ||||
| 	//Inner CG | ||||
| 	InnerCGtimer.Start(); | ||||
|   int if_relup = 0; | ||||
| #if 0 | ||||
|         MSCG(Linop_f,src_f,sol_f); | ||||
| #else | ||||
| { | ||||
|    | ||||
|   GridBase *grid = SinglePrecGrid; | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   // Convenience references to the info stored in "MultiShiftFunction" | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   int nshift = shifts.order; | ||||
|  | ||||
|  | ||||
|   std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts" | ||||
|   std::vector<RealD> &mresidual(shifts.tolerances); | ||||
|   std::vector<RealD> alpha(nshift,1.); | ||||
|   std::vector<FieldF>   ps(nshift,grid);// Search directions | ||||
|  | ||||
|   assert(sol_f.size()==nshift); | ||||
|   assert(mass.size()==nshift); | ||||
|   assert(mresidual.size()==nshift); | ||||
|    | ||||
|   // dynamic sized arrays on stack; 2d is a pain with vector | ||||
|   RealD  bs[nshift]; | ||||
|   RealD  rsq[nshift]; | ||||
|   RealD  z[nshift][2]; | ||||
|   int     converged[nshift]; | ||||
|    | ||||
|   const int       primary =0; | ||||
|    | ||||
|   //Primary shift fields CG iteration | ||||
|   RealD a,b,c,d; | ||||
|   RealD cp,bp,qq; //prev | ||||
|    | ||||
|   int cb=src_f.checkerboard; | ||||
|   // Matrix mult fields | ||||
|   FieldF r(grid); r.checkerboard = src_f.checkerboard; | ||||
|   FieldF p(grid); p.checkerboard = src_f.checkerboard; | ||||
|   FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard; | ||||
|   FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard; | ||||
|   FieldF psi(grid);psi.checkerboard = src_f.checkerboard; | ||||
|     std::cout.precision(12); | ||||
|     std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl; | ||||
|     std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl; | ||||
|    | ||||
|    | ||||
|   // Check lightest mass | ||||
|   for(int s=0;s<nshift;s++){ | ||||
|     assert( mass[s]>= mass[primary] ); | ||||
|     converged[s]=0; | ||||
|   } | ||||
|    | ||||
|   // Wire guess to zero | ||||
|   // Residuals "r" are src | ||||
|   // First search direction "p" is also src | ||||
|   cp = norm2(src_f); | ||||
|   Real c_relup = cp; | ||||
|   for(int s=0;s<nshift;s++){ | ||||
|     rsq[s] = cp * mresidual[s] * mresidual[s]; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s | ||||
| 	     <<" target resid "<<rsq[s]<<std::endl; | ||||
|     ps[s] = src_f; | ||||
|   } | ||||
|   // r and p for primary | ||||
|   r=src_f; | ||||
|   p=src_f; | ||||
|    | ||||
|   //MdagM+m[0] | ||||
|   std::cout << "p.checkerboard " << p.checkerboard | ||||
|   << "mmp.checkerboard " << mmp.checkerboard << std::endl; | ||||
|  | ||||
|   Linop_f.HermOpAndNorm(p,mmp,d,qq); | ||||
|   axpy(mmp,mass[0],p,mmp); | ||||
|   RealD rn = norm2(p); | ||||
|   d += rn*mass[0]; | ||||
|    | ||||
|   // have verified that inner product of  | ||||
|   // p and mmp is equal to d after this since | ||||
|   // the d computation is tricky | ||||
|   //  qq = real(innerProduct(p,mmp)); | ||||
|   //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl; | ||||
|    | ||||
|   b = -cp /d; | ||||
|    | ||||
|   // Set up the various shift variables | ||||
|   int       iz=0; | ||||
|   z[0][1-iz] = 1.0; | ||||
|   z[0][iz]   = 1.0; | ||||
|   bs[0]      = b; | ||||
|   for(int s=1;s<nshift;s++){ | ||||
|     z[s][1-iz] = 1.0; | ||||
|     z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0])); | ||||
|     bs[s]      = b*z[s][iz];  | ||||
|   } | ||||
|    | ||||
|   // r += b[0] A.p[0] | ||||
|   // c= norm(r) | ||||
|   c=axpy_norm(r,b,mmp,r); | ||||
|    | ||||
|  axpby(psi,0.,-bs[0],src_f,src_f); | ||||
|   for(int s=0;s<nshift;s++) { | ||||
|     axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f); | ||||
|   } | ||||
|    | ||||
|    | ||||
|   // Iteration loop | ||||
|   int k; | ||||
|  // inefficient zeroing, please replace! | ||||
| //  RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]); | ||||
|   zeroit(sol_d[0]); | ||||
|   std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl; | ||||
|    | ||||
|  | ||||
|   int all_converged = 1; | ||||
| 	RealD tmp1,tmp2; | ||||
|   for (k=1;k<=MaxOuterIterations;k++){ | ||||
|      | ||||
|     a = c /cp; | ||||
|     axpy(p,a,p,r); | ||||
|      | ||||
|     // Note to self - direction ps is iterated separately | ||||
|     // for each shift. Does not appear to have any scope | ||||
|     // for avoiding linear algebra in "single" case. | ||||
|     //  | ||||
|     // However SAME r is used. Could load "r" and update | ||||
|     // ALL ps[s]. 2/3 Bandwidth saving | ||||
|     // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       if ( ! converged[s] ) {  | ||||
| 	if (s==0){ | ||||
| 	  axpy(ps[s],a,ps[s],r); | ||||
| 	} else{ | ||||
| 	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b); | ||||
| 	  axpby(ps[s],z[s][iz],as,r,ps[s]); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     cp=c; | ||||
|      | ||||
|     Linop_f.HermOpAndNorm(p,mmp,d,qq); | ||||
|     axpy(mmp,mass[0],p,mmp); | ||||
|     RealD rn = norm2(p); | ||||
|     d += rn*mass[0]; | ||||
|      | ||||
|     bp=b; | ||||
|     b=-cp/d; | ||||
|      | ||||
|     c=axpy_norm(r,b,mmp,r); | ||||
|  | ||||
|  | ||||
|     // Toggle the recurrence history | ||||
|     bs[0] = b; | ||||
|     iz = 1-iz; | ||||
|     for(int s=1;s<nshift;s++){ | ||||
|       if((!converged[s])){ | ||||
| 	RealD z0 = z[s][1-iz]; | ||||
| 	RealD z1 = z[s][iz]; | ||||
| 	z[s][iz] = z0*z1*bp | ||||
| 	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));  | ||||
| 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | ||||
|       } | ||||
|     } | ||||
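|     // In formulas (a transcription of the update above, not a rederivation): | ||||
|     // z[s] tracks zeta_k^s, the ratio of the shifted to the unshifted | ||||
|     // residual, r_s = zeta_k^s * r. With z0 = zeta_{k-1}^s, z1 = zeta_k^s and | ||||
|     // sigma_s = mass[s], the recurrence reads | ||||
|     //   zeta_{k+1}^s = z0*z1*bp / ( b*a*(z1-z0) + z1*bp*(1 - (sigma_s-sigma_0)*b) ) | ||||
|     //   bs[s]        = b * zeta_{k+1}^s / z0 | ||||
|     // where a, b, bp are the unshifted CG coefficients of this and the | ||||
|     // previous step. | ||||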
|      | ||||
|     axpy(psi,-bs[0],ps[0],psi); | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       int ss = s; | ||||
|       // Scope for optimisation here in case of "single". | ||||
|       // Could load sol_f[0] and pull all ps[s] in. | ||||
|       //      if ( single ) ss=primary; | ||||
| 	      // Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving | ||||
|       // Pipelined CG gain: | ||||
|       // | ||||
|       // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
|       // New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps | ||||
| 	      // If we can predict the coefficient bs then we can fuse these and avoid the write/re-read cycle | ||||
|       //  on ps[s]. | ||||
|       // Before:  3 x npole  + 3 x npole | ||||
|       // After :  2 x npole (ps[s])        => 3x speed up of multishift CG. | ||||
|        | ||||
|       if( (!converged[s]) ) {  | ||||
| 	axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]); | ||||
|       } | ||||
|     } | ||||
|  | ||||
|  | ||||
|     if (k%MaxInnerIterations==0){ | ||||
| //    if (c < 1e-4*c_relup){ | ||||
|        RealD c_f=c; | ||||
|        precisionChange(tmp_d,psi); | ||||
|        RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d); | ||||
|        tmp1 = norm2(psi); | ||||
|        zeroit(psi); | ||||
|        tmp2 = norm2(psi); | ||||
|        std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl; | ||||
| //       precisionChange(sol_d[0],sol_f[0]); | ||||
|        Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2); | ||||
|        axpy(tmp2_d,mass[0],psi_d,tmp_d); | ||||
|        axpy(tmp_d,-1.,tmp2_d,src_d); | ||||
|        precisionChange(r,tmp_d); | ||||
| 	c_relup = norm2(r); | ||||
|        std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl; | ||||
| 	if_relup=1; | ||||
|     } | ||||
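|     // The block above is a "reliable update": every MaxInnerIterations steps | ||||
|     // the single-precision accumulator psi is folded into the double-precision | ||||
|     // psi_d, the true residual src_d - (A + mass[0]) psi_d is recomputed in | ||||
|     // double, and the single-precision recursion is restarted from it. This | ||||
|     // bounds the drift between the recursive residual c and the recomputed | ||||
|     // c_relup. | ||||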
|      | ||||
|     // Convergence checks | ||||
|   all_converged=1; | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|        | ||||
|       if ( (!converged[s]) ){ | ||||
| 	 | ||||
| 	RealD css  = c * z[s][iz]* z[s][iz]; | ||||
| 	 | ||||
| 	if(css<rsq[s]){ | ||||
| 	  if ( ! converged[s] ) | ||||
| 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl; | ||||
| 	  converged[s]=1; | ||||
| 	} else { | ||||
| 	  if (k%MaxInnerIterations==0) | ||||
| 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<" >= "<<rsq[s]<<std::endl; | ||||
| 	  all_converged=0; | ||||
| 	} | ||||
|  | ||||
|       } | ||||
|     } | ||||
|      | ||||
| #if 0 | ||||
|     if ( all_converged ){ | ||||
|       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged at iteration "<<k<<std::endl; | ||||
| #else | ||||
|     if ( converged[0] ){ | ||||
|       std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 has converged, terminating at iteration "<<k<<std::endl; | ||||
| #endif | ||||
|        | ||||
| #if 1 | ||||
|       for(int s=1; s < nshift; s++) {  | ||||
| 	Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq); | ||||
| 	axpy(tmp,mass[s],sol_f[s],mmp); | ||||
| 	axpy(r,-alpha[s],src_f,tmp); | ||||
| 	RealD rn = norm2(r); | ||||
| 	RealD cn = norm2(src_f); | ||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||
|       } | ||||
| #endif | ||||
|      iter = k; | ||||
|       break; | ||||
|     } | ||||
|   } | ||||
|   // ugly hack | ||||
|   if ( !all_converged ) | ||||
|   std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; | ||||
| //  assert(0); | ||||
| } | ||||
| 	 | ||||
| #endif | ||||
| 	InnerCGtimer.Stop(); | ||||
|        | ||||
| 	//Convert sol back to double and add to double prec solution | ||||
| 	PrecChangeTimer.Start(); | ||||
| 	sol_d[0]=psi_d; | ||||
| 	for(int i=1;i<nshift;i++)precisionChange(sol_d[i], sol_f[i]); | ||||
|       std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; | ||||
|       // Check answers  | ||||
|       for(int s=0; s < nshift; s++) {  | ||||
| 	RealD tmp1,tmp2; | ||||
|        Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2); | ||||
|        axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d); | ||||
|        axpy(tmp_d,-1.,src_d,tmp2_d); | ||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl; | ||||
|       } | ||||
| 	PrecChangeTimer.Stop(); | ||||
|        | ||||
| } | ||||
|      | ||||
|       //Final trial CG | ||||
|  //     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|       TotalTimer.Stop(); | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
| } | ||||
|  | ||||
| #endif | ||||
| @@ -1,168 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/ConjugateGradientShifted.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H | ||||
| #define GRID_CONJUGATE_GRADIENT_SHIFTED_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|     ///////////////////////////////////////////////////////////// | ||||
|     // Base classes for iterative processes based on operators | ||||
|     // single input vec, single output vec. | ||||
|     ///////////////////////////////////////////////////////////// | ||||
|  | ||||
|   template<class Field>  | ||||
|     class ConjugateGradientShifted : public OperatorFunction<Field> { | ||||
| public:                                                 | ||||
|     bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true. | ||||
|     RealD   Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     ConjugateGradientShifted(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) {  | ||||
|     }; | ||||
|  | ||||
|     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi ){ | ||||
| 	(*this)(Linop,src,psi,NULL); | ||||
|     } | ||||
|  | ||||
|     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi, RealD *shift){ | ||||
|  | ||||
|       psi.checkerboard = src.checkerboard; | ||||
|       conformable(psi,src); | ||||
|  | ||||
|       RealD cp,c,a,d,b,ssq,qq,b_pred; | ||||
|        | ||||
|       Field   p(src); | ||||
|       Field mmp(src); | ||||
|       Field   r(src); | ||||
|        | ||||
|       //Initial residual computation & set up | ||||
|       RealD guess = norm2(psi); | ||||
|       assert(std::isnan(guess)==0); | ||||
|  | ||||
|       Linop.HermOpAndNorm(psi,mmp,d,b); | ||||
| 	if(shift) axpy(mmp,*shift,psi,mmp); | ||||
| 	RealD rn = norm2(psi); | ||||
| 	if(shift) d += rn*(*shift); | ||||
| 	RealD d2 = real(innerProduct(psi,mmp)); | ||||
| 	b= norm2(mmp); | ||||
|       RealD src_norm=norm2(src); | ||||
|       r= src-mmp; | ||||
|       p= r; | ||||
|        | ||||
|       a  =norm2(p); | ||||
|       cp =a; | ||||
|       ssq=norm2(src); | ||||
|  | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl; | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl; | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl; | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl; | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl; | ||||
|       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl; | ||||
|  | ||||
|       RealD rsq =  Tolerance* Tolerance*ssq; | ||||
|        | ||||
|       //Check if guess is really REALLY good :) | ||||
|       if ( cp <= rsq ) { | ||||
| 	return; | ||||
|       } | ||||
|        | ||||
|       std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl; | ||||
|  | ||||
|       GridStopWatch LinalgTimer; | ||||
|       GridStopWatch MatrixTimer; | ||||
|       GridStopWatch SolverTimer; | ||||
|  | ||||
|       SolverTimer.Start(); | ||||
|       int k; | ||||
|       for (k=1;k<=MaxIterations;k++){ | ||||
| 	 | ||||
| 	c=cp; | ||||
|  | ||||
| 	MatrixTimer.Start(); | ||||
| 	Linop.HermOpAndNorm(p,mmp,d,qq); | ||||
| 	MatrixTimer.Stop(); | ||||
| 	LinalgTimer.Start(); | ||||
| 	if(shift) axpy(mmp,*shift,p,mmp); | ||||
| 	RealD rn = norm2(p); | ||||
| 	if(shift) d += rn*(*shift); | ||||
| 	RealD d2 = real(innerProduct(p,mmp)); | ||||
| 	qq = norm2(mmp); | ||||
|       if (k%10==1) std::cout<< std::setprecision(4)<< "d:  "<<d<<" d2= "<<d2<<std::endl; | ||||
|  | ||||
| 	//	RealD    qqck = norm2(mmp); | ||||
| 	//	ComplexD dck  = innerProduct(p,mmp); | ||||
|        | ||||
| 	a      = c/d; | ||||
| 	b_pred = a*(a*qq-d)/c; | ||||
|  | ||||
| 	cp = axpy_norm(r,-a,mmp,r); | ||||
| 	b = cp/c; | ||||
|       if (k%10==1) std::cout<< std::setprecision(4)<<"k= "<<k<<" src:  "<<src_norm<<" r= "<<cp<<std::endl; | ||||
| 	 | ||||
| 	// Fuse these loops ; should be really easy | ||||
| 	psi= a*p+psi; | ||||
| 	p  = p*b+r; | ||||
| 	   | ||||
| 	LinalgTimer.Stop(); | ||||
| 	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl; | ||||
| 	 | ||||
| 	// Stopping condition | ||||
| 	if ( cp <= rsq ) {  | ||||
| 	   | ||||
| 	  SolverTimer.Stop(); | ||||
| 	  Linop.HermOpAndNorm(psi,mmp,d,qq); | ||||
| 	  if(shift) mmp = mmp + (*shift) * psi; | ||||
| 	  p=mmp-src; | ||||
| 	   | ||||
| 	  RealD mmpnorm = sqrt(norm2(mmp)); | ||||
| 	  RealD psinorm = sqrt(norm2(psi)); | ||||
| 	  RealD srcnorm = sqrt(norm2(src)); | ||||
| 	  RealD resnorm = sqrt(norm2(p)); | ||||
| 	  RealD true_residual = resnorm/srcnorm; | ||||
|  | ||||
| 	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k | ||||
| 		   <<" computed residual "<<sqrt(cp/ssq) | ||||
| 		   <<" true residual "    <<true_residual | ||||
| 		   <<" target "<<Tolerance<<std::endl; | ||||
| 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed(); | ||||
| 	  std::cout<<std::endl; | ||||
| 	   | ||||
| 	if(ErrorOnNoConverge) | ||||
| 	  assert(true_residual/Tolerance < 1000.0); | ||||
|  | ||||
| 	  return; | ||||
| 	} | ||||
|       } | ||||
|       std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl; | ||||
| //      assert(0); | ||||
|     } | ||||
|   }; | ||||
| } | ||||
| #endif | ||||
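|  | ||||
| // The shift support this deleted class provided never forms A + sigma | ||||
| // explicitly: one extra axpy after HermOpAndNorm adds sigma*p, and the Krylov | ||||
| // inner product d is corrected by sigma*|p|^2. The pattern, isolated (a | ||||
| // sketch using the same Grid calls as the code above): | ||||
|  | ||||
| Linop.HermOpAndNorm(p, mmp, d, qq);    // mmp = A p,  d = <p, A p> | ||||
| if (shift) { | ||||
|   axpy(mmp, *shift, p, mmp);           // mmp += sigma * p | ||||
|   d += (*shift) * norm2(p);            // d   += sigma * <p, p> | ||||
|   qq = norm2(mmp);                     // refresh |mmp|^2 for the shifted op | ||||
| } | ||||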
| @@ -30,20 +30,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #define GRID_IRL_H | ||||
|  | ||||
| #include <string.h> //memset | ||||
|  | ||||
| #ifdef USE_LAPACK | ||||
| #ifdef USE_MKL | ||||
| #include<mkl_lapack.h> | ||||
| #else | ||||
| void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | ||||
|                    double *vl, double *vu, int *il, int *iu, double *abstol, | ||||
|                    int *m, double *w, double *z, int *ldz, int *isuppz, | ||||
|                    double *work, int *lwork, int *iwork, int *liwork, | ||||
|                    int *info); | ||||
| //#include <lapacke/lapacke.h> | ||||
| #endif | ||||
| #endif | ||||
| #include "DenseMatrix.h" | ||||
| #include "EigenSort.h" | ||||
|  | ||||
| #include <Grid/algorithms/densematrix/DenseMatrix.h> | ||||
| #include <Grid/algorithms/iterative/EigenSort.h> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| @@ -67,13 +64,12 @@ public: | ||||
|     int Np;      // Np -- Number of spare vecs in Krylov space | ||||
|     int Nm;      // Nm -- total number of vectors | ||||
|  | ||||
|  | ||||
|     RealD OrthoTime; | ||||
|  | ||||
|     RealD eresid; | ||||
|  | ||||
|     SortEigen<Field> _sort; | ||||
|  | ||||
| //    GridCartesian &_fgrid; | ||||
|  | ||||
|     LinearOperatorBase<Field> &_Linop; | ||||
|  | ||||
|     OperatorFunction<Field>   &_poly; | ||||
| @@ -130,23 +126,23 @@ public: | ||||
|  | ||||
|       GridBase *grid = evec[0]._grid; | ||||
|       Field w(grid); | ||||
|       std::cout<<GridLogMessage << "RitzMatrix "<<std::endl; | ||||
|       std::cout << "RitzMatrix "<<std::endl; | ||||
|       for(int i=0;i<k;i++){ | ||||
| 	_poly(_Linop,evec[i],w); | ||||
| 	std::cout<<GridLogMessage << "["<<i<<"] "; | ||||
| 	std::cout << "["<<i<<"] "; | ||||
| 	for(int j=0;j<k;j++){ | ||||
| 	  ComplexD in = innerProduct(evec[j],w); | ||||
| 	  if ( fabs((double)i-j)>1 ) {  | ||||
| 	    if (abs(in) >1.0e-9 )  {  | ||||
| 	      std::cout<<GridLogMessage<<"oops"<<std::endl; | ||||
| 	      std::cout<<"oops"<<std::endl; | ||||
| 	      abort(); | ||||
| 	    } else  | ||||
| 	      std::cout<<GridLogMessage << " 0 "; | ||||
| 	      std::cout << " 0 "; | ||||
| 	  } else {  | ||||
| 	    std::cout<<GridLogMessage << " "<<in<<" "; | ||||
| 	    std::cout << " "<<in<<" "; | ||||
| 	  } | ||||
| 	} | ||||
| 	std::cout<<GridLogMessage << std::endl; | ||||
| 	std::cout << std::endl; | ||||
|       } | ||||
|     } | ||||
|  | ||||
| @@ -180,10 +176,10 @@ public: | ||||
|       RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop | ||||
|                                  // 7. vk+1 := wk/βk+1 | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "alpha = " << zalph << " beta "<<beta<<std::endl; | ||||
| //	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl; | ||||
|       const RealD tiny = 1.0e-20; | ||||
|       if ( beta < tiny ) {  | ||||
| 	std::cout<<GridLogMessage << " beta is tiny "<<beta<<std::endl; | ||||
| 	std::cout << " beta is tiny "<<beta<<std::endl; | ||||
|      } | ||||
|       lmd[k] = alph; | ||||
|       lme[k]  = beta; | ||||
| @@ -259,7 +255,6 @@ public: | ||||
|     } | ||||
|  | ||||
| #ifdef USE_LAPACK | ||||
| #define LAPACK_INT long long | ||||
|     void diagonalize_lapack(DenseVector<RealD>& lmd, | ||||
| 		     DenseVector<RealD>& lme,  | ||||
| 		     int N1, | ||||
| @@ -269,7 +264,7 @@ public: | ||||
|   const int size = Nm; | ||||
| //  tevals.resize(size); | ||||
| //  tevecs.resize(size); | ||||
|   LAPACK_INT NN = N1; | ||||
|   int NN = N1; | ||||
|   double evals_tmp[NN]; | ||||
|   double evec_tmp[NN][NN]; | ||||
|   memset(evec_tmp[0],0,sizeof(double)*NN*NN); | ||||
| @@ -283,19 +278,19 @@ public: | ||||
|         if (i==j) evals_tmp[i] = lmd[i]; | ||||
|         if (j==(i-1)) EE[j] = lme[j]; | ||||
|       } | ||||
|   LAPACK_INT evals_found; | ||||
|   LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; | ||||
|   LAPACK_INT liwork =  3+NN*10 ; | ||||
|   LAPACK_INT iwork[liwork]; | ||||
|   int evals_found; | ||||
|   int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; | ||||
|   int liwork =  3+NN*10 ; | ||||
|   int iwork[liwork]; | ||||
|   double work[lwork]; | ||||
|   LAPACK_INT isuppz[2*NN]; | ||||
|   int isuppz[2*NN]; | ||||
|   char jobz = 'V'; // calculate evals & evecs | ||||
|   char range = 'I'; // calculate evals in the index range il..iu | ||||
|   //    char range = 'A'; // calculate all evals | ||||
|   char uplo = 'U'; // refer to upper half of original matrix | ||||
|   char compz = 'I'; // Compute eigenvectors of tridiagonal matrix | ||||
|   int ifail[NN]; | ||||
|   long long info; | ||||
|   int info; | ||||
| //  int total = QMP_get_number_of_nodes(); | ||||
| //  int node = QMP_get_node_number(); | ||||
| //  GridBase *grid = evec[0]._grid; | ||||
| @@ -303,18 +298,14 @@ public: | ||||
|   int node = grid->_processor; | ||||
|   int interval = (NN/total)+1; | ||||
|   double vl = 0.0, vu = 0.0; | ||||
|   LAPACK_INT il = interval*node+1 , iu = interval*(node+1); | ||||
|   int il = interval*node+1 , iu = interval*(node+1); | ||||
|   if (iu > NN)  iu=NN; | ||||
|   double tol = 0.0; | ||||
|     if (1) { | ||||
|       memset(evals_tmp,0,sizeof(double)*NN); | ||||
|       if ( il <= NN){ | ||||
|         printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu); | ||||
| #ifdef USE_MKL | ||||
|         dstegr(&jobz, &range, &NN, | ||||
| #else | ||||
|         LAPACK_dstegr(&jobz, &range, &NN, | ||||
| #endif | ||||
|             (double*)DD, (double*)EE, | ||||
|             &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A' | ||||
|             &tol, // tolerance | ||||
| @@ -346,7 +337,6 @@ public: | ||||
|       lmd [NN-1-i]=evals_tmp[i]; | ||||
|   } | ||||
| } | ||||
| #undef LAPACK_INT  | ||||
| #endif | ||||
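|  | ||||
| // A note on the LAPACK path above: the spectrum is split across ranks by | ||||
| // eigenvalue index, range = 'I' asking dstegr for eigenpairs il..iu only. | ||||
| // The index arithmetic, isolated (total and node as in the code): | ||||
|  | ||||
| int interval = (NN / total) + 1;       // split NN indices over `total` ranks | ||||
| int il = interval * node + 1;          // first (1-based) index for this rank | ||||
| int iu = interval * (node + 1);        // last index for this rank | ||||
| if (iu > NN) iu = NN;                  // clamp the final rank's interval | ||||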
|  | ||||
|  | ||||
| @@ -377,14 +367,12 @@ public: | ||||
| //	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid); | ||||
| #endif | ||||
|  | ||||
|       int Niter = 10000*N1; | ||||
|       int Niter = 100*N1; | ||||
|       int kmin = 1; | ||||
|       int kmax = N2; | ||||
|       // (this should be more sophisticated) | ||||
|  | ||||
|       for(int iter=0; ; ++iter){ | ||||
|       if ( (iter+1)%(100*N1)==0)  | ||||
|       std::cout<<GridLogMessage << "[QL method] Not converged - iteration "<<iter+1<<"\n"; | ||||
|       for(int iter=0; iter<Niter; ++iter){ | ||||
|  | ||||
| 	// determination of 2x2 leading submatrix | ||||
| 	RealD dsub = lmd[kmax-1]-lmd[kmax-2]; | ||||
| @@ -413,11 +401,11 @@ public: | ||||
|         _sort.push(lmd3,N2); | ||||
|         _sort.push(lmd2,N2); | ||||
|          for(int k=0; k<N2; ++k){ | ||||
| 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout<<GridLogMessage <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl; | ||||
| //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout<<GridLogMessage <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl; | ||||
| 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl; | ||||
| //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl; | ||||
| 	  } | ||||
|          for(int k=0; k<N1*N1; ++k){ | ||||
| //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout<<GridLogMessage <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl; | ||||
| //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl; | ||||
| 	} | ||||
|     } | ||||
| #endif | ||||
| @@ -432,7 +420,7 @@ public: | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|       std::cout<<GridLogMessage << "[QL method] Error - Too many iterations: "<<Niter<<"\n"; | ||||
|       std::cout << "[QL method] Error - Too many iterations: "<<Niter<<"\n"; | ||||
|       abort(); | ||||
|     } | ||||
|  | ||||
| @@ -449,7 +437,6 @@ public: | ||||
| 		       DenseVector<Field>& evec, | ||||
| 		       int k) | ||||
|     { | ||||
|       double t0=-usecond()/1e6; | ||||
|       typedef typename Field::scalar_type MyComplex; | ||||
|       MyComplex ip; | ||||
|  | ||||
| @@ -468,8 +455,6 @@ public: | ||||
| 	w = w - ip * evec[j]; | ||||
|       } | ||||
|       normalise(w); | ||||
|       t0+=usecond()/1e6; | ||||
|       OrthoTime +=t0; | ||||
|     } | ||||
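|  | ||||
| // For reference, the routine whose timing was just removed is classical | ||||
| // Gram-Schmidt of the new Lanczos vector w against the previous k basis | ||||
| // vectors, condensed from the code above: | ||||
|  | ||||
| for (int j = 0; j < k; ++j) { | ||||
|   MyComplex ip = innerProduct(evec[j], w);  // overlap <evec_j, w> | ||||
|   w = w - ip * evec[j];                     // project it out | ||||
| } | ||||
| normalise(w);                               // unit norm for the next step | ||||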
|  | ||||
|     void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) { | ||||
| @@ -503,10 +488,10 @@ until convergence | ||||
| 	GridBase *grid = evec[0]._grid; | ||||
| 	assert(grid == src._grid); | ||||
|  | ||||
| 	std::cout<<GridLogMessage << " -- Nk = " << Nk << " Np = "<< Np << std::endl; | ||||
| 	std::cout<<GridLogMessage << " -- Nm = " << Nm << std::endl; | ||||
| 	std::cout<<GridLogMessage << " -- size of eval   = " << eval.size() << std::endl; | ||||
| 	std::cout<<GridLogMessage << " -- size of evec  = " << evec.size() << std::endl; | ||||
| 	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl; | ||||
| 	std::cout << " -- Nm = " << Nm << std::endl; | ||||
| 	std::cout << " -- size of eval   = " << eval.size() << std::endl; | ||||
| 	std::cout << " -- size of evec  = " << evec.size() << std::endl; | ||||
| 	 | ||||
| 	assert(Nm == evec.size() && Nm == eval.size()); | ||||
| 	 | ||||
| @@ -517,7 +502,6 @@ until convergence | ||||
| 	DenseVector<int>   Iconv(Nm); | ||||
|  | ||||
| 	DenseVector<Field>  B(Nm,grid); // waste of space replicating | ||||
| //	DenseVector<Field>  Btemp(Nm,grid); // waste of space replicating | ||||
| 	 | ||||
| 	Field f(grid); | ||||
| 	Field v(grid); | ||||
| @@ -533,48 +517,35 @@ until convergence | ||||
| 	// (uniform vector) Why not src?? | ||||
| 	//	evec[0] = 1.0; | ||||
| 	evec[0] = src; | ||||
| 	std::cout<<GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl; | ||||
| 	std::cout <<"norm2(src)= " << norm2(src)<<std::endl; | ||||
| // << src._grid  << std::endl; | ||||
| 	normalise(evec[0]); | ||||
| 	std::cout<<GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl; | ||||
| 	std::cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl; | ||||
| // << evec[0]._grid << std::endl; | ||||
| 	 | ||||
| 	// Initial Nk steps | ||||
| 	OrthoTime=0.; | ||||
| 	double t0=usecond()/1e6; | ||||
| 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k); | ||||
| 	double t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::Initial steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl; | ||||
| //	std:: cout<<GridLogMessage <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl; | ||||
| //	std:: cout<<GridLogMessage <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl; | ||||
| //	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl; | ||||
| //	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl; | ||||
| 	RitzMatrix(evec,Nk); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	for(int k=0; k<Nk; ++k){ | ||||
| //	std:: cout<<GridLogMessage <<"eval " << k << " " <<eval[k] << std::endl; | ||||
| //	std:: cout<<GridLogMessage <<"lme " << k << " " << lme[k] << std::endl; | ||||
| //	std:: cout <<"eval " << k << " " <<eval[k] << std::endl; | ||||
| //	std:: cout <<"lme " << k << " " << lme[k] << std::endl; | ||||
| 	} | ||||
|  | ||||
| 	// Restarting loop begins | ||||
| 	for(int iter = 0; iter<Niter; ++iter){ | ||||
|  | ||||
| 	  std::cout<<GridLogMessage<<"\n Restart iteration = "<< iter << std::endl; | ||||
| 	  std::cout<<"\n Restart iteration = "<< iter << std::endl; | ||||
|  | ||||
| 	  //  | ||||
| 	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs. | ||||
| 	  // We loop over  | ||||
| 	  // | ||||
| 	OrthoTime=0.; | ||||
| 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL:: "<<Np <<" steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl; | ||||
| 	  f *= lme[Nm-1]; | ||||
|  | ||||
| 	  RitzMatrix(evec,k2); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL:: RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	   | ||||
| 	  // getting eigenvalues | ||||
| 	  for(int k=0; k<Nm; ++k){ | ||||
| @@ -583,27 +554,18 @@ until convergence | ||||
| 	  } | ||||
| 	  setUnit_Qt(Nm,Qt); | ||||
| 	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL:: diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
|  | ||||
| 	  // sorting | ||||
| 	  _sort.push(eval2,Nm); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL:: eval sorting: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	   | ||||
| 	  // Implicitly shifted QR transformations | ||||
| 	  setUnit_Qt(Nm,Qt); | ||||
| 	  for(int ip=0; ip<k2; ++ip){ | ||||
| 	std::cout<<GridLogMessage << "eval "<< ip << " "<< eval2[ip] << std::endl; | ||||
| 	} | ||||
| 	  for(int ip=k2; ip<Nm; ++ip){  | ||||
| 	std::cout<<GridLogMessage << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl; | ||||
| 	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl; | ||||
| 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm); | ||||
| 		 | ||||
| 	} | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::qr_decomp: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| if (0) {   | ||||
|      | ||||
| 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0; | ||||
| 	   | ||||
| 	  for(int j=k1-1; j<k2+1; ++j){ | ||||
| @@ -612,38 +574,14 @@ if (0) { | ||||
| 	      B[j] += Qt[k+Nm*j] * evec[k]; | ||||
| 	    } | ||||
| 	  } | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::QR Rotate: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| } | ||||
|  | ||||
| if (1) { | ||||
| 	for(int i=0; i<(Nk+1); ++i) { | ||||
| 		B[i] = 0.0; | ||||
| 	  	B[i].checkerboard = evec[0].checkerboard; | ||||
| 	} | ||||
|  | ||||
| 	int j_block = 24; int k_block=24; | ||||
| PARALLEL_FOR_LOOP | ||||
| 	for(int ss=0;ss < grid->oSites();ss++){ | ||||
| 	for(int jj=k1-1; jj<k2+1; jj += j_block) | ||||
| 	for(int kk=0; kk<Nm; kk += k_block) | ||||
| 	for(int j=jj; (j<(k2+1)) && j<(jj+j_block); ++j){ | ||||
| 	for(int k=kk; (k<Nm) && k<(kk+k_block) ; ++k){ | ||||
| 	    B[j]._odata[ss] +=Qt[k+Nm*j] * evec[k]._odata[ss];  | ||||
| 	} | ||||
| 	} | ||||
| 	} | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::QR rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| } | ||||
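| 	// The j_block/k_block tiling above cache-blocks the dense rotation | ||||
| 	// evec' = Qt * evec applied site by site: for each lattice site ss the | ||||
| 	// Qt coefficients are reused across 24x24 tiles, keeping Qt hot in | ||||
| 	// cache while the large lattice fields stream through once. | ||||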
| 	for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j]; | ||||
| 	  for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j]; | ||||
|  | ||||
| 	  // Compressed vector f and beta(k2) | ||||
| 	  f *= Qt[Nm-1+Nm*(k2-1)]; | ||||
| 	  f += lme[k2-1] * evec[k2]; | ||||
| 	  beta_k = norm2(f); | ||||
| 	  beta_k = sqrt(beta_k); | ||||
| 	  std::cout<<GridLogMessage<<" beta(k) = "<<beta_k<<std::endl; | ||||
| 	  std::cout<<" beta(k) = "<<beta_k<<std::endl; | ||||
|  | ||||
| 	  RealD betar = 1.0/beta_k; | ||||
| 	  evec[k2] = betar * f; | ||||
| @@ -656,10 +594,7 @@ PARALLEL_FOR_LOOP | ||||
| 	  } | ||||
| 	  setUnit_Qt(Nm,Qt); | ||||
| 	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	   | ||||
| if (0) { | ||||
| 	  for(int k = 0; k<Nk; ++k) B[k]=0.0; | ||||
| 	   | ||||
| 	  for(int j = 0; j<Nk; ++j){ | ||||
| @@ -667,34 +602,12 @@ if (0) { | ||||
| 	    B[j].checkerboard = evec[k].checkerboard; | ||||
| 	      B[j] += Qt[k+j*Nm] * evec[k]; | ||||
| 	    } | ||||
| 	    std::cout<<GridLogMessage << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl; | ||||
| //	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl; | ||||
| 	  } | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::Convergence rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| } | ||||
| if (1) { | ||||
| 	for(int i=0; i<(Nk+1); ++i) { | ||||
| 		B[i] = 0.0; | ||||
| 	  	B[i].checkerboard = evec[0].checkerboard; | ||||
| 	} | ||||
|  | ||||
| 	int j_block = 24; int k_block=24; | ||||
| PARALLEL_FOR_LOOP | ||||
| 	for(int ss=0;ss < grid->oSites();ss++){ | ||||
| 	for(int jj=0; jj<Nk; jj += j_block) | ||||
| 	for(int kk=0; kk<Nk; kk += k_block) | ||||
| 	for(int j=jj; (j<Nk) && j<(jj+j_block); ++j){ | ||||
| 	for(int k=kk; (k<Nk) && k<(kk+k_block) ; ++k){ | ||||
| 	    B[j]._odata[ss] +=Qt[k+Nm*j] * evec[k]._odata[ss];  | ||||
| 	} | ||||
| 	} | ||||
| 	} | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::convergence rotation : "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| } | ||||
| //	_sort.push(eval2,B,Nk); | ||||
|  | ||||
| 	  Nconv = 0; | ||||
| 	  //	  std::cout<<GridLogMessage << std::setiosflags(std::ios_base::scientific); | ||||
| 	  //	  std::cout << std::setiosflags(std::ios_base::scientific); | ||||
| 	  for(int i=0; i<Nk; ++i){ | ||||
|  | ||||
| //	    _poly(_Linop,B[i],v); | ||||
| @@ -702,16 +615,14 @@ PARALLEL_FOR_LOOP | ||||
| 	     | ||||
| 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp. | ||||
| 	    RealD vden = norm2(B[i]); | ||||
| 	    RealD vv0 = norm2(v); | ||||
| 	    eval2[i] = vnum/vden; | ||||
| 	    v -= eval2[i]*B[i]; | ||||
| 	    RealD vv = norm2(v); | ||||
| 	     | ||||
| 	    std::cout.precision(13); | ||||
| 	    std::cout<<GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	    std::cout<<"eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i]; | ||||
| 	    std::cout<<"|H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv; | ||||
| 	    std::cout<<" "<< vnum/(sqrt(vden)*sqrt(vv0)) << std::endl; | ||||
| 	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i]; | ||||
| 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl; | ||||
| 	     | ||||
| 	// criterion changed: the evals are assumed sorted here, so all evals smaller (larger) than the Nstop-th should already have converged | ||||
| 	    if((vv<eresid*eresid) && (i == Nconv) ){ | ||||
| @@ -720,19 +631,17 @@ PARALLEL_FOR_LOOP | ||||
| 	    } | ||||
|  | ||||
| 	  }  // i-loop end | ||||
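|  | ||||
| 	  // In formulas: the i-loop computes the Rayleigh quotient | ||||
| 	  //   eval2[i] = <B_i, H B_i> / <B_i, B_i> | ||||
| 	  // and accepts mode i once the residual satisfies | ||||
| 	  //   || H B_i - eval2[i] B_i ||^2 < eresid^2, | ||||
| 	  // with the i == Nconv condition keeping converged modes a contiguous | ||||
| 	  // prefix of the sorted eigenvalue list. | ||||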
| 	  //	  std::cout<<GridLogMessage << std::resetiosflags(std::ios_base::scientific); | ||||
| 	t1=usecond()/1e6; | ||||
| 	std::cout<<GridLogMessage <<"IRL::convergence testing: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||
| 	  //	  std::cout << std::resetiosflags(std::ios_base::scientific); | ||||
|  | ||||
|  | ||||
| 	  std::cout<<GridLogMessage<<" #modes converged: "<<Nconv<<std::endl; | ||||
| 	  std::cout<<" #modes converged: "<<Nconv<<std::endl; | ||||
|  | ||||
| 	  if( Nconv>=Nstop ){ | ||||
| 	    goto converged; | ||||
| 	  } | ||||
| 	} // end of iter loop | ||||
| 	 | ||||
| 	std::cout<<GridLogMessage<<"\n NOT converged.\n"; | ||||
| 	std::cout<<"\n NOT converged.\n"; | ||||
| 	abort(); | ||||
| 	 | ||||
|       converged: | ||||
| @@ -745,10 +654,10 @@ PARALLEL_FOR_LOOP | ||||
|        } | ||||
|       _sort.push(eval,evec,Nconv); | ||||
|  | ||||
|       std::cout<<GridLogMessage << "\n Converged\n Summary :\n"; | ||||
|       std::cout<<GridLogMessage << " -- Iterations  = "<< Nconv  << "\n"; | ||||
|       std::cout<<GridLogMessage << " -- beta(k)     = "<< beta_k << "\n"; | ||||
|       std::cout<<GridLogMessage << " -- Nconv       = "<< Nconv  << "\n"; | ||||
|       std::cout << "\n Converged\n Summary :\n"; | ||||
|       std::cout << " -- Iterations  = "<< Nconv  << "\n"; | ||||
|       std::cout << " -- beta(k)     = "<< beta_k << "\n"; | ||||
|       std::cout << " -- Nconv       = "<< Nconv  << "\n"; | ||||
|      } | ||||
|  | ||||
|     ///////////////////////////////////////////////// | ||||
| @@ -771,25 +680,25 @@ PARALLEL_FOR_LOOP | ||||
| 	} | ||||
|       } | ||||
|  | ||||
|       std::cout<<GridLogMessage<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl; | ||||
|       std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl; | ||||
|  | ||||
|       // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1 | ||||
|       int first; | ||||
|       if(start == 0){ | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "start == 0\n"; //TESTING | ||||
| 	std::cout << "start == 0\n"; //TESTING | ||||
|  | ||||
| 	_poly(_Linop,bq[0],bf); | ||||
|  | ||||
| 	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0] | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "alpha = " << alpha << std::endl; | ||||
| 	std::cout << "alpha = " << alpha << std::endl; | ||||
| 	 | ||||
| 	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0] | ||||
|  | ||||
| 	H[0][0]=alpha; | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "Set H(0,0) to " << H[0][0] << std::endl; | ||||
| 	std::cout << "Set H(0,0) to " << H[0][0] << std::endl; | ||||
|  | ||||
| 	first = 1; | ||||
|  | ||||
| @@ -809,19 +718,19 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
| 	beta = 0;sqbt = 0; | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "cont is true so setting beta to zero\n"; | ||||
| 	std::cout << "cont is true so setting beta to zero\n"; | ||||
|  | ||||
|       }	else { | ||||
|  | ||||
| 	beta = norm2(bf); | ||||
| 	sqbt = sqrt(beta); | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "beta = " << beta << std::endl; | ||||
| 	std::cout << "beta = " << beta << std::endl; | ||||
|       } | ||||
|  | ||||
|       for(int j=first;j<end;j++){ | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "Factor j " << j <<std::endl; | ||||
| 	std::cout << "Factor j " << j <<std::endl; | ||||
|  | ||||
| 	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right. | ||||
| 	  bq[j] = bf; cont = false; | ||||
| @@ -844,7 +753,7 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
| 	beta = fnorm; | ||||
| 	sqbt = sqrt(beta); | ||||
| 	std::cout<<GridLogMessage << "alpha = " << alpha << " fnorm = " << fnorm << '\n'; | ||||
| 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n'; | ||||
|  | ||||
| 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ] | ||||
| 	int re = 0; | ||||
| @@ -879,8 +788,8 @@ PARALLEL_FOR_LOOP | ||||
| 	  bck = sqrt( nmbex ); | ||||
| 	  re++; | ||||
| 	} | ||||
| 	std::cout<<GridLogMessage << "Iteratively refined orthogonality, changes alpha\n"; | ||||
| 	if(re > 1) std::cout<<GridLogMessage << "orthogonality refined " << re << " times" <<std::endl; | ||||
| 	std::cout << "Iteratively refined orthogonality, changes alpha\n"; | ||||
| 	if(re > 1) std::cout << "orthogonality refined " << re << " times" <<std::endl; | ||||
| 	H[j][j]=alpha; | ||||
|       } | ||||
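|  | ||||
| // Stepping back: Lanczos_Factor builds the standard three-term recurrence. | ||||
| // With alpha_j = <q_j, A q_j> and beta_{j+1} = ||f_j||, | ||||
| // | ||||
| //   \[ f_j = A q_j - \alpha_j q_j - \beta_j q_{j-1}, \qquad | ||||
| //      q_{j+1} = f_j / \beta_{j+1}, \] | ||||
| // | ||||
| // so H is tridiagonal with the alphas on the diagonal and the betas on the | ||||
| // off-diagonals; the iterative-refinement pass above re-orthogonalises f_j | ||||
| // against all earlier q's when round-off degrades this in finite precision. | ||||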
|  | ||||
| @@ -895,13 +804,11 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
|     void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont) | ||||
|     { | ||||
|       std::cout<<GridLogMessage << "ImplicitRestart begin. Eigensort starting\n"; | ||||
|       std::cout << "ImplicitRestart begin. Eigensort starting\n"; | ||||
|  | ||||
|       DenseMatrix<RealD> H; Resize(H,Nm,Nm); | ||||
|  | ||||
| #ifndef USE_LAPACK | ||||
|       EigenSort(evals, evecs); | ||||
| #endif | ||||
|  | ||||
|       ///Assign shifts | ||||
|       int K=Nk; | ||||
| @@ -924,15 +831,15 @@ PARALLEL_FOR_LOOP | ||||
|       /// Shifted H defines a new K step Arnoldi factorization | ||||
|       RealD  beta = H[ff][ff-1];  | ||||
|       RealD  sig  = Q[TM - 1][ff - 1]; | ||||
|       std::cout<<GridLogMessage << "beta = " << beta << " sig = " << real(sig) <<std::endl; | ||||
|       std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl; | ||||
|  | ||||
|       std::cout<<GridLogMessage << "TM = " << TM << " "; | ||||
|       std::cout<<GridLogMessage << norm2(bq[0]) << " -- before" <<std::endl; | ||||
|       std::cout << "TM = " << TM << " "; | ||||
|       std::cout << norm2(bq[0]) << " -- before" <<std::endl; | ||||
|  | ||||
|       /// q -> q Q | ||||
|       times_real(bq, Q, TM); | ||||
|  | ||||
|       std::cout<<GridLogMessage << norm2(bq[0]) << " -- after " << ff <<std::endl; | ||||
|       std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl; | ||||
|       bf =  beta* bq[ff] + sig* bf; | ||||
|  | ||||
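The `bf` update is the standard implicitly-restarted Arnoldi residual identity; a sketch of what the line above encodes, in this file's notation:

```latex
f^{+} \;=\; \beta\, q_{\mathrm{ff}} \;+\; \sigma\, f, \qquad
\beta = H_{\mathrm{ff},\,\mathrm{ff}-1}, \qquad
\sigma = Q_{TM-1,\,\mathrm{ff}-1},
```

where Q is the accumulated product of the shifted-QR transforms, so the length-ff factorization left after the shifts still satisfies A V = V H + f^{+} e_{ff}^T.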
|       /// Do the rest of the factorization | ||||
| @@ -956,7 +863,7 @@ PARALLEL_FOR_LOOP | ||||
|       int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with | ||||
|  | ||||
|       if(ff < M) { | ||||
| 	std::cout<<GridLogMessage << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl; | ||||
| 	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl; | ||||
| 	abort(); // Why would this happen? | ||||
|       } | ||||
|  | ||||
| @@ -965,7 +872,7 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
|       for(int it = 0; it < Niter && (converged < Nk); ++it) { | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "Krylov: Iteration --> " << it << std::endl; | ||||
| 	std::cout << "Krylov: Iteration --> " << it << std::endl; | ||||
| 	int lock_num = lock ? converged : 0; | ||||
| 	DenseVector<RealD> tevals(M - lock_num ); | ||||
| 	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num); | ||||
| @@ -981,7 +888,7 @@ PARALLEL_FOR_LOOP | ||||
|       Wilkinson<RealD>(H, evals, evecs, small);  | ||||
|       //      Check(); | ||||
|  | ||||
|       std::cout<<GridLogMessage << "Done  "<<std::endl; | ||||
|       std::cout << "Done  "<<std::endl; | ||||
|  | ||||
|     } | ||||
|  | ||||
| @@ -1046,7 +953,7 @@ PARALLEL_FOR_LOOP | ||||
| 		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs,  | ||||
| 		  int lock, int converged) | ||||
|     { | ||||
|       std::cout<<GridLogMessage << "Converged " << converged << " so far." << std::endl; | ||||
|       std::cout << "Converged " << converged << " so far." << std::endl; | ||||
|       int lock_num = lock ? converged : 0; | ||||
|       int M = Nm; | ||||
|  | ||||
| @@ -1061,9 +968,7 @@ PARALLEL_FOR_LOOP | ||||
|       RealD small=1.0e-16; | ||||
|       Wilkinson<RealD>(AH, tevals, tevecs, small); | ||||
|  | ||||
| #ifndef USE_LAPACK | ||||
|       EigenSort(tevals, tevecs); | ||||
| #endif | ||||
|  | ||||
|       RealD resid_nrm=  norm2(bf); | ||||
|  | ||||
| @@ -1074,7 +979,7 @@ PARALLEL_FOR_LOOP | ||||
| 	RealD diff = 0; | ||||
| 	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm; | ||||
|  | ||||
| 	std::cout<<GridLogMessage << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl; | ||||
| 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl; | ||||
|  | ||||
| 	if(diff < converged) { | ||||
|  | ||||
| @@ -1090,13 +995,13 @@ PARALLEL_FOR_LOOP | ||||
| 	    lock_num++; | ||||
| 	  } | ||||
| 	  converged++; | ||||
| 	  std::cout<<GridLogMessage << " converged on eval " << converged << " of " << Nk << std::endl; | ||||
| 	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl; | ||||
| 	} else { | ||||
| 	  break; | ||||
| 	} | ||||
|       } | ||||
| #endif | ||||
|       std::cout<<GridLogMessage << "Got " << converged << " so far " <<std::endl;	 | ||||
|       std::cout << "Got " << converged << " so far " <<std::endl;	 | ||||
|     } | ||||
|  | ||||
|     ///Check | ||||
| @@ -1105,9 +1010,7 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
|       DenseVector<RealD> goodval(this->get); | ||||
|  | ||||
| #ifndef USE_LAPACK | ||||
|       EigenSort(evals,evecs); | ||||
| #endif | ||||
|  | ||||
|       int NM = Nm; | ||||
|  | ||||
| @@ -1179,16 +1082,14 @@ say con = 2 | ||||
| **/ | ||||
|  | ||||
| template<class T> | ||||
| static void Lock(DenseMatrix<T> &H, 	///Hess mtx	 | ||||
| 		 DenseMatrix<T> &Q, 	///Lock Transform | ||||
| 		 T val, 		///value to be locked | ||||
| 		 int con, 	///number already locked | ||||
| static void Lock(DenseMatrix<T> &H, 	// Hess mtx	 | ||||
| 		 DenseMatrix<T> &Q, 	// Lock Transform | ||||
| 		 T val, 		// value to be locked | ||||
| 		 int con, 	// number already locked | ||||
| 		 RealD small, | ||||
| 		 int dfg, | ||||
| 		 bool herm) | ||||
| {	 | ||||
|  | ||||
|  | ||||
|   //ForceTridiagonal(H); | ||||
|  | ||||
|   int M = H.dim; | ||||
| @@ -1221,7 +1122,6 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx | ||||
|   AH = Hermitian(QQ)*AH; | ||||
|   AH = AH*QQ; | ||||
|  | ||||
|  | ||||
|   for(int i=con;i<M;i++){ | ||||
|     for(int j=con;j<M;j++){ | ||||
|       Q[i][j]=QQ[i-con][j-con]; | ||||
|   | ||||
| @@ -1,453 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/Matrix.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef MATRIX_H | ||||
| #define MATRIX_H | ||||
|  | ||||
| #include <cstdlib> | ||||
| #include <string> | ||||
| #include <cmath> | ||||
| #include <vector> | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
| #include <complex> | ||||
| #include <typeinfo> | ||||
| #include <Grid.h> | ||||
|  | ||||
|  | ||||
| /** Sign function **/ | ||||
| template <class T> T sign(T p){return ( p/abs(p) );} | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| ///////////////////// Hijack STL containers for our wicked means ///////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class T> using Vector = std::vector<T>; | ||||
| template<class T> using Matrix = Vector<Vector<T> >; | ||||
|  | ||||
| template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); } | ||||
|  | ||||
| template<class T> void Resize(Matrix<T > & mat, int N, int M) {  | ||||
|   mat.resize(N); | ||||
|   for(int i=0;i<N;i++){ | ||||
|     mat[i].resize(M); | ||||
|   } | ||||
| } | ||||
| template<class T> void Size(Vector<T> & vec, int &N)  | ||||
| {  | ||||
|   N= vec.size(); | ||||
| } | ||||
| template<class T> void Size(Matrix<T> & mat, int &N,int &M)  | ||||
| {  | ||||
|   N= mat.size(); | ||||
|   M= mat[0].size(); | ||||
| } | ||||
| template<class T> void SizeSquare(Matrix<T> & mat, int &N)  | ||||
| {  | ||||
|   int M; Size(mat,N,M); | ||||
|   assert(N==M); | ||||
| } | ||||
| template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1)  | ||||
| {  | ||||
|   int N2,M2; | ||||
|   Size(mat1,N1,M1); | ||||
|   Size(mat2,N2,M2); | ||||
|   assert(N1==N2); | ||||
|   assert(M1==M2); | ||||
| } | ||||
|  | ||||
| //***************************************** | ||||
| //*	(Complex) Vector operations	* | ||||
| //***************************************** | ||||
|  | ||||
| /**Conj of a Vector **/ | ||||
| template <class T> Vector<T> conj(Vector<T> p){ | ||||
| 	Vector<T> q(p.size()); | ||||
| 	for(int i=0;i<p.size();i++){q[i] = conj(p[i]);} | ||||
| 	return q; | ||||
| } | ||||
|  | ||||
| /** Norm of a Vector**/ | ||||
| template <class T> T norm(Vector<T> p){ | ||||
| 	T sum = 0; | ||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);} | ||||
| 	return abs(sqrt(sum)); | ||||
| } | ||||
|  | ||||
| /** Norm squared of a Vector **/ | ||||
| template <class T> T norm2(Vector<T> p){ | ||||
| 	T sum = 0; | ||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);} | ||||
| 	return abs((sum)); | ||||
| } | ||||
|  | ||||
| /** Sum elements of a Vector **/ | ||||
| template <class T> T trace(Vector<T> p){ | ||||
| 	T sum = 0; | ||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i];} | ||||
| 	return sum; | ||||
| } | ||||
|  | ||||
| /** Fill a Vector with constant c **/ | ||||
| template <class T> void Fill(Vector<T> &p, T c){ | ||||
| 	for(int i=0;i<p.size();i++){p[i] = c;} | ||||
| } | ||||
| /** Normalize a Vector **/ | ||||
| template <class T> void normalize(Vector<T> &p){ | ||||
| 	T m = norm(p); | ||||
| 	if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;} | ||||
| } | ||||
| /** Vector by scalar **/ | ||||
| template <class T, class U> Vector<T> times(Vector<T> p, U s){ | ||||
| 	for(int i=0;i<p.size();i++){p[i] *= s;} | ||||
| 	return p; | ||||
| } | ||||
| template <class T, class U> Vector<T> times(U s, Vector<T> p){ | ||||
| 	for(int i=0;i<p.size();i++){p[i] *= s;} | ||||
| 	return p; | ||||
| } | ||||
| /** inner product of a and b = conj(a) . b **/ | ||||
| template <class T> T inner(Vector<T> a, Vector<T> b){ | ||||
| 	T m = 0.; | ||||
| 	for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];} | ||||
| 	return m; | ||||
| } | ||||
| /** sum of a and b = a + b **/ | ||||
| template <class T> Vector<T> add(Vector<T> a, Vector<T> b){ | ||||
| 	Vector<T> m(a.size()); | ||||
| 	for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];} | ||||
| 	return m; | ||||
| } | ||||
| /** sum of a and b = a - b **/ | ||||
| template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){ | ||||
| 	Vector<T> m(a.size()); | ||||
| 	for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];} | ||||
| 	return m; | ||||
| } | ||||
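A quick sanity check of how these helpers compose (a sketch assuming the definitions above are in scope, with T = std::complex<double>):

```cpp
#include <cassert>
#include <cmath>
#include <complex>
#include <vector>

int main() {
  std::vector<std::complex<double>> a = {{1, 2}, {3, -1}};
  auto n2 = norm2(a);        // |a|^2 = (1+4) + (9+1) = 15
  auto ip = inner(a, a);     // conj(a).a, real and equal to norm2(a)
  assert(std::abs(n2 - ip) < 1e-12);
  normalize(a);
  assert(std::abs(norm(a) - 1.0) < 1e-12);
}
```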
|  | ||||
| /**  | ||||
|  ********************************* | ||||
|  *	Matrices	         * | ||||
|  ********************************* | ||||
|  **/ | ||||
|  | ||||
| template<class T> void Fill(Matrix<T> & mat, T&val) {  | ||||
|   int N,M; | ||||
|   Size(mat,N,M); | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int j=0;j<M;j++){ | ||||
|     mat[i][j] = val; | ||||
|   }} | ||||
| } | ||||
|  | ||||
| /** Transpose of a matrix **/ | ||||
| template<class T> Matrix<T> Transpose(Matrix<T> & mat){ | ||||
|   int N,M; | ||||
|   Size(mat,N,M); | ||||
|   Matrix<T> C; Resize(C,M,N); | ||||
|   for(int i=0;i<M;i++){ | ||||
|   for(int j=0;j<N;j++){ | ||||
|     C[i][j] = mat[j][i]; | ||||
|   }}  | ||||
|   return C; | ||||
| } | ||||
| /** Set Matrix to unit matrix **/ | ||||
| template<class T> void Unity(Matrix<T> &mat){ | ||||
|   int N;  SizeSquare(mat,N); | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<N;j++){ | ||||
|       if ( i==j ) mat[i][j] = 1; | ||||
|       else        mat[i][j] = 0; | ||||
|     }  | ||||
|   }  | ||||
| } | ||||
| /** Add C * I to matrix **/ | ||||
| template<class T> | ||||
| void PlusUnit(Matrix<T> & A,T c){ | ||||
|   int dim;  SizeSquare(A,dim); | ||||
|   for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}  | ||||
| } | ||||
|  | ||||
| /** return the Hermitian conjugate of matrix **/ | ||||
| template<class T> Matrix<T> HermitianConj(Matrix<T> &mat){ | ||||
|  | ||||
|   int dim; SizeSquare(mat,dim); | ||||
|  | ||||
|   Matrix<T> C; Resize(C,dim,dim); | ||||
|  | ||||
|   for(int i=0;i<dim;i++){ | ||||
|     for(int j=0;j<dim;j++){ | ||||
|       C[i][j] = conj(mat[j][i]); | ||||
|     }  | ||||
|   }  | ||||
|   return C; | ||||
| } | ||||
|  | ||||
| /** return diagonal entries as a Vector **/ | ||||
| template<class T> Vector<T> diag(Matrix<T> &A) | ||||
| { | ||||
|   int dim; SizeSquare(A,dim); | ||||
|   Vector<T> d; Resize(d,dim); | ||||
|  | ||||
|   for(int i=0;i<dim;i++){ | ||||
|     d[i] = A[i][i]; | ||||
|   } | ||||
|   return d; | ||||
| } | ||||
|  | ||||
| /** Left multiply by a Vector **/ | ||||
| template<class T> Vector<T> operator *(Vector<T> &B,Matrix<T> &A) | ||||
| { | ||||
|   int K,M,N;  | ||||
|   Size(B,K); | ||||
|   Size(A,M,N); | ||||
|   assert(K==M); | ||||
|    | ||||
|   Vector<T> C; Resize(C,N); | ||||
|  | ||||
|   for(int j=0;j<N;j++){ | ||||
|     T sum = 0.0; | ||||
|     for(int i=0;i<M;i++){ | ||||
|       sum += B[i] * A[i][j]; | ||||
|     } | ||||
|     C[j] =  sum; | ||||
|   } | ||||
|   return C;  | ||||
| } | ||||
|  | ||||
| /** return 1/diagonal entries as a Vector **/ | ||||
| template<class T> Vector<T> inv_diag(Matrix<T> & A){ | ||||
|   int dim; SizeSquare(A,dim); | ||||
|   Vector<T> d; Resize(d,dim); | ||||
|   for(int i=0;i<dim;i++){ | ||||
|     d[i] = 1.0/A[i][i]; | ||||
|   } | ||||
|   return d; | ||||
| } | ||||
| /** Matrix Addition **/ | ||||
| template<class T> inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B) | ||||
| { | ||||
|   int N,M  ; SizeSame(A,B,N,M); | ||||
|   Matrix<T> C; Resize(C,N,M); | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<M;j++){ | ||||
|       C[i][j] = A[i][j] +  B[i][j]; | ||||
|     }  | ||||
|   }  | ||||
|   return C; | ||||
| }  | ||||
| /** Matrix Subtraction **/ | ||||
| template<class T> inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){ | ||||
|   int N,M  ; SizeSame(A,B,N,M); | ||||
|   Matrix<T> C; Resize(C,N,M); | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int j=0;j<M;j++){ | ||||
|     C[i][j] = A[i][j] -  B[i][j]; | ||||
|   }} | ||||
|   return C; | ||||
| }  | ||||
|  | ||||
| /** Matrix scalar multiplication **/ | ||||
| template<class T> inline Matrix<T> operator* (Matrix<T> & A,T c){ | ||||
|   int N,M; Size(A,N,M); | ||||
|   Matrix<T> C; Resize(C,N,M); | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int j=0;j<M;j++){ | ||||
|     C[i][j] = A[i][j]*c; | ||||
|   }}  | ||||
|   return C; | ||||
| }  | ||||
| /** Matrix Matrix multiplication **/ | ||||
| template<class T> inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){ | ||||
|   int K,L,N,M; | ||||
|   Size(A,K,L); | ||||
|   Size(B,N,M); assert(L==N); | ||||
|   Matrix<T> C; Resize(C,K,M); | ||||
|  | ||||
|   for(int i=0;i<K;i++){ | ||||
|     for(int j=0;j<M;j++){ | ||||
|       T sum = 0.0; | ||||
|       for(int k=0;k<N;k++) sum += A[i][k]*B[k][j]; | ||||
|       C[i][j] =sum; | ||||
|     } | ||||
|   } | ||||
|   return C;  | ||||
| }  | ||||
| /** Matrix Vector multiplication **/ | ||||
| template<class T> inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){ | ||||
|   int M,N,K; | ||||
|   Size(A,N,M); | ||||
|   Size(B,K); assert(K==M); | ||||
|   Vector<T> C; Resize(C,N); | ||||
|   for(int i=0;i<N;i++){ | ||||
|     T sum = 0.0; | ||||
|     for(int j=0;j<M;j++) sum += A[i][j]*B[j]; | ||||
|     C[i] =  sum; | ||||
|   } | ||||
|   return C;  | ||||
| }  | ||||
|  | ||||
| /** Some version of Matrix norm **/ | ||||
| /* | ||||
| inline T Norm(){ // this is not a usual L2 norm | ||||
|     T norm = 0; | ||||
|     for(int i=0;i<dim;i++){ | ||||
|       for(int j=0;j<dim;j++){ | ||||
| 	norm += abs(A[i][j]); | ||||
|     }} | ||||
|     return norm; | ||||
|   } | ||||
| */ | ||||
|  | ||||
| /** Some version of Matrix norm **/ | ||||
| template<class T> T LargestDiag(Matrix<T> &A) | ||||
| { | ||||
|   int dim ; SizeSquare(A,dim);  | ||||
|  | ||||
|   T ld = abs(A[0][0]); | ||||
|   for(int i=1;i<dim;i++){ | ||||
|     T cf = abs(A[i][i]); | ||||
|     if(abs(cf) > abs(ld) ){ld = cf;} | ||||
|   } | ||||
|   return ld; | ||||
| } | ||||
|  | ||||
| /** Look for entries on the leading subdiagonal that are smaller than 'small' **/ | ||||
| template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small) | ||||
| { | ||||
|   int dim; SizeSquare(A,dim); | ||||
|   for(int l = dim - 1 - offset; l >= 1; l--) {             		 | ||||
|     if((U)abs(A[l][l - 1]) < (U)small) { | ||||
|       A[l][l-1]=(U)0.0; | ||||
|       return l; | ||||
|     } | ||||
|   } | ||||
|   return 0; | ||||
| } | ||||
|  | ||||
| /** Look for entries on the leading subdiagonal that are smaller than 'small' **/ | ||||
| template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)  | ||||
| { | ||||
|   int dim; SizeSquare(A,dim); | ||||
|   for(int l = dim - 1 - offset; l >= 1; l--) { | ||||
|     if((U)abs(A[l][l - 1]) < (U)small) { | ||||
|       A[l][l - 1] = (U)0.0; | ||||
|       A[l - 1][l] = (U)0.0; | ||||
|       return l; | ||||
|     } | ||||
|   } | ||||
|   return 0; | ||||
| } | ||||
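Zeroing a negligible subdiagonal entry is what makes deflation work: the tridiagonal matrix splits into two independent blocks whose spectra can then be treated separately. A sketch of the split:

```latex
T \longrightarrow
\begin{pmatrix} T_{1} & 0 \\ 0 & T_{2} \end{pmatrix}
\quad\text{once } |T_{l,l-1}| < \text{small},
\qquad
\operatorname{spec}(T) = \operatorname{spec}(T_1) \cup \operatorname{spec}(T_2).
```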
| /**Assign a submatrix to a larger one (NB these Vectors of Vectors are transposes of the matrices they represent)**/ | ||||
| template<class T> | ||||
| void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S) | ||||
| { | ||||
|   for(int i = row_st; i<row_end; i++){ | ||||
|     for(int j = col_st; j<col_end; j++){ | ||||
|       A[i][j] = S[i - row_st][j - col_st]; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| /**Get a square submatrix**/ | ||||
| template <class T> | ||||
| Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end) | ||||
| { | ||||
|   Matrix<T> H; Resize(H,row_end - row_st,col_end-col_st); | ||||
|  | ||||
|   for(int i = row_st; i<row_end; i++){ | ||||
|   for(int j = col_st; j<col_end; j++){ | ||||
|     H[i-row_st][j-col_st]=A[i][j]; | ||||
|   }} | ||||
|   return H; | ||||
| } | ||||
|    | ||||
|    | ||||
| /** compute b_i A_ij b_j **/ // surprised no Conj | ||||
| template<class T> T proj(Matrix<T> A, Vector<T> B){ | ||||
|   int dim; SizeSquare(A,dim); | ||||
|   int dimB; Size(B,dimB); | ||||
|   assert(dimB==dim); | ||||
|   T C = 0; | ||||
|   for(int i=0;i<dim;i++){ | ||||
|     T sum = 0.0; | ||||
|     for(int j=0;j<dim;j++){ | ||||
|       sum += A[i][j]*B[j]; | ||||
|     } | ||||
|     C +=  B[i]*sum; // No conj? | ||||
|   } | ||||
|   return C;  | ||||
| } | ||||
|  | ||||
|  | ||||
| /* | ||||
|  ************************************************************* | ||||
|  * | ||||
|  * Matrix Vector products | ||||
|  * | ||||
|  ************************************************************* | ||||
|  */ | ||||
| // Instead make a linop and call my CG; | ||||
|  | ||||
| /// q -> q Q | ||||
| template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q) | ||||
| { | ||||
|   int M; SizeSquare(Q,M); | ||||
|   int N; Size(q,N);  | ||||
|   assert(M==N); | ||||
|  | ||||
|   times(q,Q,N); | ||||
| } | ||||
|  | ||||
| /// q -> q Q | ||||
| template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q, int N) | ||||
| { | ||||
|   GridBase *grid = q[0]._grid; | ||||
|   int M; SizeSquare(Q,M); | ||||
|   int K; Size(q,K);  | ||||
|   assert(N<=M); | ||||
|   assert(N<=K); | ||||
|   Vector<Fermion> S(N,Fermion(grid)); | ||||
|   for(int j=0;j<N;j++){ | ||||
|     S[j] = zero; | ||||
|     for(int k=0;k<N;k++){ | ||||
|       S[j] = S[j] +  q[k]* Q[k][j];  | ||||
|     } | ||||
|   } | ||||
|   for(int j=0;j<N;j++){ // only the first N vectors were rotated | ||||
|     q[j] = S[j]; | ||||
|   } | ||||
| } | ||||
| #endif | ||||
| @@ -1,15 +0,0 @@ | ||||
| - ConjugateGradientMultiShift | ||||
| - MCR | ||||
|  | ||||
| - Potentially Useful Boost libraries | ||||
|  | ||||
| - MultiArray | ||||
| - Aligned allocator; memory pool | ||||
| - Remez -- Mike or Boost? | ||||
| - Multiprecision | ||||
| - quaternions | ||||
| - Tokenize | ||||
| - Serialization | ||||
| - Regex | ||||
| - Proto (ET) | ||||
| - uBlas | ||||
| @@ -1,122 +0,0 @@ | ||||
| #include <math.h> | ||||
| #include <stdlib.h> | ||||
| #include <vector> | ||||
|  | ||||
| struct Bisection { | ||||
|  | ||||
| static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig) | ||||
| { | ||||
|   int i,j; | ||||
|   std::vector<RealD> evec1(row_num+3); | ||||
|   std::vector<RealD> evec2(row_num+3); | ||||
|   RealD eps2; | ||||
|   ALPHA[1]=0.; | ||||
|   BETA[1]=0.; | ||||
|   for(i=0;i<row_num-1;i++) { | ||||
|     ALPHA[i+1] = A[i*(row_num+1)].real();   // NB: A (dense source matrix) and H (output) come from the calling scope | ||||
|     BETA[i+2]  = A[i*(row_num+1)+1].real(); | ||||
|   } | ||||
|   ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real(); | ||||
|   bisec(ALPHA,BETA,row_num,1,row_num,1e-10,1e-10,evec1,eps2); | ||||
|   bisec(ALPHA,BETA,row_num,1,row_num,1e-16,1e-16,evec2,eps2); | ||||
|  | ||||
|   // Do we really need to sort here? | ||||
|   int begin=1; | ||||
|   int end = row_num; | ||||
|   int swapped=1; | ||||
|   while(swapped) { | ||||
|     swapped=0; | ||||
|     for(i=begin;i<end;i++){ | ||||
|       if(fabs(evec2[i])>fabs(evec2[i+1]))	{ | ||||
| 	std::swap(evec2[i],evec2[i+1]); | ||||
| 	swapped=1; | ||||
|       } | ||||
|     } | ||||
|     end--; | ||||
|     for(i=end-1;i>=begin;i--){ | ||||
|       if(fabs(evec2[i])>fabs(evec2[i+1]))	{ | ||||
| 	std::swap(evec2[i],evec2[i+1]); | ||||
| 	swapped=1; | ||||
|       } | ||||
|     } | ||||
|     begin++; | ||||
|   } | ||||
|  | ||||
|   for(i=0;i<row_num;i++){ | ||||
|     for(j=0;j<row_num;j++) { | ||||
|       if(i==j) H[i*row_num+j]=evec2[i+1]; | ||||
|       else H[i*row_num+j]=0.; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void bisec(std::vector<RealD> &c,    | ||||
| 		  std::vector<RealD> &b, | ||||
| 		  int n, | ||||
| 		  int m1, | ||||
| 		  int m2, | ||||
| 		  RealD eps1, | ||||
| 		  RealD relfeh, | ||||
| 		  std::vector<RealD> &x, | ||||
| 		  RealD &eps2) | ||||
| { | ||||
|   std::vector<RealD> wu(n+2); | ||||
|  | ||||
|   RealD h,q,x1,xu,x0,xmin,xmax;  | ||||
|   int i,a,k; | ||||
|  | ||||
|   b[1]=0.0; | ||||
|   xmin=c[n]-fabs(b[n]); | ||||
|   xmax=c[n]+fabs(b[n]); | ||||
|   for(i=1;i<n;i++){ | ||||
|     h=fabs(b[i])+fabs(b[i+1]); | ||||
|     if(c[i]+h>xmax) xmax= c[i]+h; | ||||
|     if(c[i]-h<xmin) xmin= c[i]-h; | ||||
|   } | ||||
|   xmax *=2.; | ||||
|  | ||||
|   eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin); | ||||
|   if(eps1<=0.0) eps1=eps2; | ||||
|   eps2=0.5*eps1+7.0*(eps2); | ||||
|   x0=xmax; | ||||
|   for(i=m1;i<=m2;i++){ | ||||
|     x[i]=xmax; | ||||
|     wu[i]=xmin; | ||||
|   } | ||||
|  | ||||
|   for(k=m2;k>=m1;k--){ | ||||
|     xu=xmin; | ||||
|     i=k; | ||||
|     do{ | ||||
|       if(xu<wu[i]){ | ||||
| 	xu=wu[i]; | ||||
| 	i=m1-1; | ||||
|       } | ||||
|       i--; | ||||
|     }while(i>=m1); | ||||
|     if(x0>x[k]) x0=x[k]; | ||||
|     while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ | ||||
|       x1=(xu+x0)/2; | ||||
|  | ||||
|       a=0; | ||||
|       q=1.0; | ||||
|       for(i=1;i<=n;i++){ | ||||
| 	q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); | ||||
| 	if(q<0) a++; | ||||
|       } | ||||
|       //			printf("x1=%e a=%d\n",x1,a); | ||||
|       if(a<k){ | ||||
| 	if(a<m1){ | ||||
| 	  xu=x1; | ||||
| 	  wu[m1]=x1; | ||||
| 	}else { | ||||
| 	  xu=x1; | ||||
| 	  wu[a+1]=x1; | ||||
| 	  if(x[a]>x1) x[a]=x1; | ||||
| 	} | ||||
|       }else x0=x1; | ||||
|     } | ||||
|     x[k]=(x0+xu)/2; | ||||
|   } | ||||
| } | ||||
| } | ||||
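The inner loop of `bisec` is a Sturm-sequence count: the number of negative values of q along the recursion equals the number of eigenvalues of the symmetric tridiagonal matrix that lie below x1, which is what steers the bisection. A minimal standalone version of that count (a hypothetical helper, 0-indexed arrays):

```cpp
#include <cmath>
#include <vector>

// Count eigenvalues of the symmetric tridiagonal matrix (diagonal c,
// off-diagonal b) that are strictly less than x, via the Sturm sequence
// q_i = c_i - x - b_i^2 / q_{i-1}, with q_{-1} = 1.
int sturm_count(const std::vector<double> &c, const std::vector<double> &b, double x) {
  int count = 0;
  double q = 1.0;
  const double tiny = 1.0e-30;                 // guard against division by zero
  for (size_t i = 0; i < c.size(); i++) {
    double bb = (i == 0) ? 0.0 : b[i];
    q = c[i] - x - ((q != 0.0) ? bb * bb / q : std::fabs(bb) / tiny);
    if (q < 0) count++;
  }
  return count;
}
```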
| @@ -1 +0,0 @@ | ||||
|  | ||||
| @@ -1,7 +1,7 @@ | ||||
|  | ||||
|  | ||||
|  | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| @@ -13,9 +13,10 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { | ||||
|  | ||||
|   if (bytes < 4096 ) return NULL; | ||||
|  | ||||
| #ifdef _OPENMP | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   void * ret = NULL; | ||||
|   int v = -1; | ||||
|  | ||||
| @@ -6,8 +6,9 @@ | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
| @@ -49,10 +50,9 @@ public: | ||||
|  | ||||
|     GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {}; | ||||
|  | ||||
|  | ||||
|     // Physics Grid information. | ||||
|     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. | ||||
|     std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal | ||||
|     std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal | ||||
|     std::vector<int> _gdimensions;// Global dimensions of array after cb removal | ||||
|     std::vector<int> _ldimensions;// local dimensions of array with processor images removed | ||||
|     std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed  | ||||
| @@ -62,13 +62,12 @@ public: | ||||
|     int _isites; | ||||
|     int _fsites;                  // _isites*_osites = product(dimensions). | ||||
|     int _gsites; | ||||
|     std::vector<int> _slice_block;   // subslice information | ||||
|     std::vector<int> _slice_block;// subslice information | ||||
|     std::vector<int> _slice_stride; | ||||
|     std::vector<int> _slice_nblock; | ||||
|  | ||||
|     // Might need these at some point | ||||
|     //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d] | ||||
|     //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 | ||||
|     std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d] | ||||
|     std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 | ||||
|  | ||||
| public: | ||||
|  | ||||
| @@ -99,7 +98,7 @@ public: | ||||
|     virtual int oIndex(std::vector<int> &coor) | ||||
|     { | ||||
|         int idx=0; | ||||
| 	// Works with either global or local coordinates | ||||
|         // Works with either global or local coordinates | ||||
|         for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); | ||||
|         return idx; | ||||
|     } | ||||
| @@ -121,6 +120,11 @@ public: | ||||
|       Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); | ||||
|     } | ||||
|  | ||||
|     inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) { | ||||
|       lcoor.resize(_ndimension); | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|         lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; | ||||
|     } | ||||
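The method above recombines the outer-site coordinate with the SIMD-lane coordinate; a small standalone sketch with assumed values (e.g. `_rdimensions[d]=4`, `_simd_layout[d]=2`, so lane 0 holds local sites 0–3 and lane 1 holds sites 4–7):

```cpp
#include <vector>

// lcoor[d] = ocoor[d] + rdims[d]*icoor[d], mirroring InOutCoorToLocalCoor.
std::vector<int> toLocal(const std::vector<int> &ocoor,
                         const std::vector<int> &icoor,
                         const std::vector<int> &rdims) {
  std::vector<int> lcoor(rdims.size());
  for (size_t d = 0; d < rdims.size(); d++)
    lcoor[d] = ocoor[d] + rdims[d] * icoor[d];  // e.g. 3 + 4*1 = 7
  return lcoor;
}
```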
|  | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     // SIMD lane addressing | ||||
| @@ -129,6 +133,7 @@ public: | ||||
|     { | ||||
|       Lexicographic::CoorFromIndex(coor,lane,_simd_layout); | ||||
|     } | ||||
|  | ||||
|     inline int PermuteDim(int dimension){ | ||||
|       return _simd_layout[dimension]>1; | ||||
|     } | ||||
| @@ -146,15 +151,15 @@ public: | ||||
|       // Distance should be either 0,1,2.. | ||||
|       // | ||||
|       if ( _simd_layout[dimension] > 2 ) {  | ||||
| 	for(int d=0;d<_ndimension;d++){ | ||||
| 	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  ); | ||||
| 	} | ||||
| 	permute_type = RotateBit; // How to specify distance; this is not just direction. | ||||
| 	return permute_type; | ||||
|         for(int d=0;d<_ndimension;d++){ | ||||
|           if ( d != dimension ) assert ( (_simd_layout[d]==1)  ); | ||||
|         } | ||||
|         permute_type = RotateBit; // How to specify distance; this is not just direction. | ||||
|         return permute_type; | ||||
|       } | ||||
|  | ||||
|       for(int d=_ndimension-1;d>dimension;d--){ | ||||
| 	if (_simd_layout[d]>1 ) permute_type++; | ||||
|         if (_simd_layout[d]>1 ) permute_type++; | ||||
|       } | ||||
|       return permute_type; | ||||
|     } | ||||
| @@ -169,26 +174,50 @@ public: | ||||
|     inline int gSites(void) const { return _isites*_osites*_Nprocessors; };  | ||||
|     inline int Nd    (void) const { return _ndimension;}; | ||||
|  | ||||
|     inline const std::vector<int> LocalStarts(void)             { return _lstart;    }; | ||||
|     inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;}; | ||||
|     inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;}; | ||||
|     inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;}; | ||||
|     inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;}; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Utility to print the full decomposition details  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|  | ||||
|     void show_decomposition(){ | ||||
|       std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl; | ||||
|       std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl; | ||||
|       std::cout << GridLogMessage << "iSites             : " << _isites << std::endl; | ||||
|       std::cout << GridLogMessage << "oSites             : " << _osites << std::endl; | ||||
|       std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;         | ||||
|       std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl; | ||||
|       std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;              | ||||
|     }  | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Global addressing | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ | ||||
|       assert(gidx< gSites()); | ||||
|       Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); | ||||
|     } | ||||
|     void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ | ||||
|       assert(lidx<lSites()); | ||||
|       Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|     void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ | ||||
|       gidx=0; | ||||
|       int mult=1; | ||||
|       for(int mu=0;mu<_ndimension;mu++) { | ||||
| 	gidx+=mult*gcoor[mu]; | ||||
| 	mult*=_gdimensions[mu]; | ||||
|         gidx+=mult*gcoor[mu]; | ||||
|         mult*=_gdimensions[mu]; | ||||
|       } | ||||
|     } | ||||
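The loop is plain lexicographic indexing, fastest in dimension 0. For example, with assumed global dimensions {4,4,4,8} and gcoor {1,2,3,4}:

```latex
\text{gidx} \;=\; 1 \;+\; 4\cdot 2 \;+\; (4\cdot 4)\cdot 3 \;+\; (4\cdot 4\cdot 4)\cdot 4
\;=\; 1 + 8 + 48 + 256 \;=\; 313 .
```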
|     void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor) | ||||
| @@ -196,9 +225,9 @@ public: | ||||
|       pcoor.resize(_ndimension); | ||||
|       lcoor.resize(_ndimension); | ||||
|       for(int mu=0;mu<_ndimension;mu++){ | ||||
| 	int _fld  = _fdimensions[mu]/_processors[mu]; | ||||
| 	pcoor[mu] = gcoor[mu]/_fld; | ||||
| 	lcoor[mu] = gcoor[mu]%_fld; | ||||
|         int _fld  = _fdimensions[mu]/_processors[mu]; | ||||
|         pcoor[mu] = gcoor[mu]/_fld; | ||||
|         lcoor[mu] = gcoor[mu]%_fld; | ||||
|       } | ||||
|     } | ||||
|     void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor) | ||||
| @@ -207,16 +236,16 @@ public: | ||||
|       std::vector<int> lcoor; | ||||
|       GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); | ||||
|       rank = RankFromProcessorCoor(pcoor); | ||||
|  | ||||
|       /* | ||||
|       std::vector<int> cblcoor(lcoor); | ||||
|       for(int d=0;d<cblcoor.size();d++){ | ||||
| 	if( this->CheckerBoarded(d) ) { | ||||
| 	  cblcoor[d] = lcoor[d]/2; | ||||
| 	} | ||||
|         if( this->CheckerBoarded(d) ) { | ||||
|           cblcoor[d] = lcoor[d]/2; | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim | ||||
|       o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim | ||||
|       */ | ||||
|       i_idx= iIndex(lcoor); | ||||
|       o_idx= oIndex(lcoor); | ||||
|     } | ||||
|  | ||||
|     void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) | ||||
| @@ -238,7 +267,7 @@ public: | ||||
|     { | ||||
|       RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); | ||||
|       if(CheckerBoarded(0)){ | ||||
| 	fcoor[0] = fcoor[0]*2+cb; | ||||
|         fcoor[0] = fcoor[0]*2+cb; | ||||
|       } | ||||
|     } | ||||
|     void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor) | ||||
|   | ||||
| @@ -76,6 +76,8 @@ public: | ||||
|         _ldimensions.resize(_ndimension); | ||||
|         _rdimensions.resize(_ndimension); | ||||
|         _simd_layout.resize(_ndimension); | ||||
| 	_lstart.resize(_ndimension); | ||||
| 	_lend.resize(_ndimension); | ||||
|              | ||||
|         _ostride.resize(_ndimension); | ||||
|         _istride.resize(_ndimension); | ||||
| @@ -94,8 +96,10 @@ public: | ||||
| 	  // Use a reduced simd grid | ||||
| 	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions | ||||
| 	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition | ||||
| 	  _osites *= _rdimensions[d]; | ||||
| 	  _isites *= _simd_layout[d]; | ||||
| 	  _lstart[d]     = _processor_coor[d]*_ldimensions[d]; | ||||
| 	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; | ||||
| 	  _osites  *= _rdimensions[d]; | ||||
| 	  _isites  *= _simd_layout[d]; | ||||
|                  | ||||
| 	  // Addressing support | ||||
| 	  if ( d==0 ) { | ||||
|   | ||||
| @@ -151,6 +151,8 @@ public: | ||||
|       _ldimensions.resize(_ndimension); | ||||
|       _rdimensions.resize(_ndimension); | ||||
|       _simd_layout.resize(_ndimension); | ||||
|       _lstart.resize(_ndimension); | ||||
|       _lend.resize(_ndimension); | ||||
|        | ||||
|       _ostride.resize(_ndimension); | ||||
|       _istride.resize(_ndimension); | ||||
| @@ -169,6 +171,8 @@ public: | ||||
| 	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard | ||||
| 	} | ||||
| 	_ldimensions[d] = _gdimensions[d]/_processors[d]; | ||||
| 	_lstart[d]     = _processor_coor[d]*_ldimensions[d]; | ||||
| 	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; | ||||
|  | ||||
| 	// Use a reduced simd grid | ||||
| 	_simd_layout[d] = simd_layout[d]; | ||||
|   | ||||
| @@ -25,7 +25,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| /////////////////////////////////////////////////////////////// | ||||
| @@ -33,6 +34,7 @@ namespace Grid { | ||||
| /////////////////////////////////////////////////////////////// | ||||
| void *              CartesianCommunicator::ShmCommBuf; | ||||
| uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||
| CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | ||||
|  | ||||
| ///////////////////////////////// | ||||
| // Alloc, free shmem region | ||||
| @@ -58,6 +60,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) { | ||||
| ///////////////////////////////// | ||||
| // Grid information queries | ||||
| ///////////////////////////////// | ||||
| int                      CartesianCommunicator::Dimensions(void)         { return _ndimension; }; | ||||
| int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; }; | ||||
| int                      CartesianCommunicator::BossRank(void)          { return 0; }; | ||||
| int                      CartesianCommunicator::ThisRank(void)          { return _processor; }; | ||||
| @@ -88,7 +91,10 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | ||||
|  | ||||
| #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | ||||
|  | ||||
| void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | ||||
| int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | ||||
|  | ||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 						       void *xmit, | ||||
| 						       int xmit_to_rank, | ||||
| 						       void *recv, | ||||
| @@ -96,6 +102,7 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_ | ||||
| 						       int bytes) | ||||
| { | ||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||
|   return 2.0*bytes; | ||||
| } | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | ||||
| { | ||||
|   | ||||
| @@ -116,6 +116,12 @@ class CartesianCommunicator { | ||||
|   // Implemented in Communicator_base.C | ||||
|   ///////////////////////////////// | ||||
|   static void * ShmCommBuf; | ||||
|  | ||||
|   // Isend/Irecv/Wait, or Sendrecv blocking | ||||
|   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; | ||||
|   static CommunicatorPolicy_t CommunicatorPolicy; | ||||
|   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } | ||||
|  | ||||
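A caller that wants fully blocking exchanges (for instance to sidestep asynchronous-progress issues on some MPI stacks) would select the sequential policy before communications begin; a sketch, not a tested recipe:

```cpp
// Force blocking MPI_Sendrecv exchanges instead of concurrent Isend/Irecv
// pairs (the default policy is CommunicatorPolicyConcurrent).
CartesianCommunicator::SetCommunicatorPolicy(
    CartesianCommunicator::CommunicatorPolicySequential);
```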
|   size_t heap_top; | ||||
|   size_t heap_bytes; | ||||
|  | ||||
| @@ -142,12 +148,15 @@ class CartesianCommunicator { | ||||
|   int  RankFromProcessorCoor(std::vector<int> &coor); | ||||
|   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); | ||||
|    | ||||
|   int                      Dimensions(void)        ; | ||||
|   int                      IsBoss(void)            ; | ||||
|   int                      BossRank(void)          ; | ||||
|   int                      ThisRank(void)          ; | ||||
|   const std::vector<int> & ThisProcessorCoor(void) ; | ||||
|   const std::vector<int> & ProcessorGrid(void)     ; | ||||
|   int                      ProcessorCount(void)    ; | ||||
|   int                      NodeCount(void)    ; | ||||
|   int                      RankCount(void)    ; | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////////////// | ||||
|   // very VERY rarely (Log, serial RNG) we need world without a grid | ||||
| @@ -168,6 +177,8 @@ class CartesianCommunicator { | ||||
|   void GlobalSumVector(ComplexF *c,int N); | ||||
|   void GlobalSum(ComplexD &c); | ||||
|   void GlobalSumVector(ComplexD *c,int N); | ||||
|   void GlobalXOR(uint32_t &); | ||||
|   void GlobalXOR(uint64_t &); | ||||
|    | ||||
|   template<class obj> void GlobalSum(obj &o){ | ||||
|     typedef typename obj::scalar_type scalar_type; | ||||
| @@ -200,7 +211,7 @@ class CartesianCommunicator { | ||||
|    | ||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||
|  | ||||
|   void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 				  void *xmit, | ||||
| 				  int xmit_to_rank, | ||||
| 				  void *recv, | ||||
|   | ||||
| @@ -25,7 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
| #include <Grid/GridQCDcore.h> | ||||
| #include <Grid/qcd/action/ActionCore.h> | ||||
| #include <mpi.h> | ||||
|  | ||||
| namespace Grid { | ||||
| @@ -39,9 +41,13 @@ MPI_Comm CartesianCommunicator::communicator_world; | ||||
| // Should error check all MPI calls. | ||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|   int flag; | ||||
|   int provided; | ||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||
|   if ( !flag ) { | ||||
|     MPI_Init(argc,argv); | ||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||
|     if ( provided != MPI_THREAD_MULTIPLE ) { | ||||
|       QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; | ||||
|     } | ||||
|   } | ||||
|   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||
|   ShmInitGeneric(); | ||||
| @@ -77,6 +83,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalXOR(uint64_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalSum(float &f){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| @@ -152,24 +166,34 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | ||||
| 						int from, | ||||
| 						int bytes) | ||||
| { | ||||
|   MPI_Request xrq; | ||||
|   MPI_Request rrq; | ||||
|   int rank = _processor; | ||||
|   int myrank = _processor; | ||||
|   int ierr; | ||||
|   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||
|     MPI_Request xrq; | ||||
|     MPI_Request rrq; | ||||
|  | ||||
|   assert(ierr==0); | ||||
|     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|      | ||||
|   list.push_back(xrq); | ||||
|   list.push_back(rrq); | ||||
|     assert(ierr==0); | ||||
|     list.push_back(xrq); | ||||
|     list.push_back(rrq); | ||||
|   } else {  | ||||
|     // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||
|     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, | ||||
| 		      recv,bytes,MPI_CHAR,from, from, | ||||
| 		      communicator,MPI_STATUS_IGNORE); | ||||
|     assert(ierr==0); | ||||
|   } | ||||
| } | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   int nreq=list.size(); | ||||
|   std::vector<MPI_Status> status(nreq); | ||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||
|   assert(ierr==0); | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||
|     int nreq=list.size(); | ||||
|     std::vector<MPI_Status> status(nreq); | ||||
|     int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||
|     assert(ierr==0); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::Barrier(void) | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -25,9 +25,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| #include <mpi.h> | ||||
|  | ||||
| #include <semaphore.h> | ||||
| #include <fcntl.h> | ||||
| #include <unistd.h> | ||||
| #include <limits.h> | ||||
| #include <sys/types.h> | ||||
| #include <sys/ipc.h> | ||||
| #include <sys/shm.h> | ||||
| #include <sys/mman.h> | ||||
| //#include <zlib.h> | ||||
| #ifndef SHM_HUGETLB | ||||
| #define SHM_HUGETLB 04000 | ||||
| #endif | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -50,6 +64,11 @@ std::vector<int> CartesianCommunicator::GroupRanks; | ||||
| std::vector<int> CartesianCommunicator::MyGroup; | ||||
| std::vector<void *> CartesianCommunicator::ShmCommBufs; | ||||
|  | ||||
| int CartesianCommunicator::NodeCount(void)    { return GroupSize;}; | ||||
| int CartesianCommunicator::RankCount(void)    { return WorldSize;}; | ||||
|  | ||||
|  | ||||
| #undef FORCE_COMMS | ||||
| void *CartesianCommunicator::ShmBufferSelf(void) | ||||
| { | ||||
|   return ShmCommBufs[ShmRank]; | ||||
| @@ -57,6 +76,9 @@ void *CartesianCommunicator::ShmBufferSelf(void) | ||||
| void *CartesianCommunicator::ShmBuffer(int rank) | ||||
| { | ||||
|   int gpeer = GroupRanks[rank]; | ||||
| #ifdef FORCE_COMMS | ||||
|   return NULL; | ||||
| #endif | ||||
|   if (gpeer == MPI_UNDEFINED){ | ||||
|     return NULL; | ||||
|   } else {  | ||||
| @@ -65,7 +87,13 @@ void *CartesianCommunicator::ShmBuffer(int rank) | ||||
| } | ||||
| void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | ||||
| { | ||||
|   static int count =0; | ||||
|   int gpeer = GroupRanks[rank]; | ||||
|   assert(gpeer!=ShmRank); // never send to self | ||||
|   assert(rank!=WorldRank);// never send to self | ||||
| #ifdef FORCE_COMMS | ||||
|   return NULL; | ||||
| #endif | ||||
|   if (gpeer == MPI_UNDEFINED){ | ||||
|     return NULL; | ||||
|   } else {  | ||||
| @@ -76,16 +104,27 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|  | ||||
|   int flag; | ||||
|   int provided; | ||||
|   //  mtrace(); | ||||
|  | ||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||
|   if ( !flag ) { | ||||
|     MPI_Init(argc,argv); | ||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||
|     assert (provided == MPI_THREAD_MULTIPLE); | ||||
|   } | ||||
|  | ||||
|   Grid_quiesce_nodes(); | ||||
|  | ||||
|   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||
|   MPI_Comm_rank(communicator_world,&WorldRank); | ||||
|   MPI_Comm_size(communicator_world,&WorldSize); | ||||
|  | ||||
|   if ( WorldRank == 0 ) { | ||||
|     std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl; | ||||
|   } | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////// | ||||
|   // Split into groups that can share memory | ||||
|   ///////////////////////////////////////////////////////////////////// | ||||
| @@ -131,7 +170,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); | ||||
|   assert(ierr==0); | ||||
|    | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
|   // find the group leaders world rank | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
| @@ -141,7 +179,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|       leaders_group[group++] = l; | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
|   // Identify the rank of the group in which I (and my leader) live | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
| @@ -152,39 +189,114 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|     } | ||||
|   } | ||||
|   assert(GroupRank!=-1); | ||||
|    | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // allocate the shared window for our group | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   MPI_Barrier(ShmComm); | ||||
|  | ||||
|   ShmCommBuf = 0; | ||||
|   ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); | ||||
|   assert(ierr==0); | ||||
|   // KNL hack -- force to numa-domain 1 in flat | ||||
| #if 0 | ||||
|   //#include <numaif.h> | ||||
|   for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){ | ||||
|     void *pages = (void *) ( page + ShmCommBuf ); | ||||
|     int status; | ||||
|     int flags=MPOL_MF_MOVE_ALL; | ||||
|     int nodes=1; // numa domain == MCDRAM | ||||
|     unsigned long count=1; | ||||
|     ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||
|     if (ierr && (page==0)) perror("numa relocate command failed"); | ||||
|   } | ||||
| #endif | ||||
|   MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); | ||||
|    | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free. | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   ShmCommBufs.resize(ShmSize); | ||||
|   for(int r=0;r<ShmSize;r++){ | ||||
|     MPI_Aint sz; | ||||
|     int dsp_unit; | ||||
|     MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]); | ||||
|  | ||||
| #if 1 | ||||
|   char shm_name [NAME_MAX]; | ||||
|   if ( ShmRank == 0 ) { | ||||
|     for(int r=0;r<ShmSize;r++){ | ||||
|  | ||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||
|  | ||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r); | ||||
|  | ||||
|       shm_unlink(shm_name); | ||||
|       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); | ||||
|       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } | ||||
|       ftruncate(fd, size); | ||||
|  | ||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | ||||
|       assert(((uint64_t)ptr&0x3F)==0); | ||||
|       ShmCommBufs[r] =ptr; | ||||
|        | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   MPI_Barrier(ShmComm); | ||||
|  | ||||
|   if ( ShmRank != 0 ) {  | ||||
|     for(int r=0;r<ShmSize;r++){ | ||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ; | ||||
|      | ||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r); | ||||
|  | ||||
|       int fd=shm_open(shm_name,O_RDWR,0666); | ||||
|       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      } | ||||
|  | ||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | ||||
|       assert(((uint64_t)ptr&0x3F)==0); | ||||
|       ShmCommBufs[r] =ptr; | ||||
|     } | ||||
|   } | ||||
|  | ||||
| #else | ||||
|   std::vector<int> shmids(ShmSize); | ||||
|  | ||||
|   if ( ShmRank == 0 ) { | ||||
|     for(int r=0;r<ShmSize;r++){ | ||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||
|       key_t key   = 0x4545 + r; | ||||
|       if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { | ||||
| 	int errsv = errno; | ||||
| 	printf("Errno %d\n",errsv); | ||||
| 	perror("shmget"); | ||||
| 	exit(1); | ||||
|       } | ||||
|       printf("shmid: 0x%x\n", shmids[r]); | ||||
|     } | ||||
|   } | ||||
|   MPI_Barrier(ShmComm); | ||||
|   MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm); | ||||
|   MPI_Barrier(ShmComm); | ||||
|  | ||||
|   for(int r=0;r<ShmSize;r++){ | ||||
|     ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0); | ||||
|     if (ShmCommBufs[r] == (uint64_t *)-1) { | ||||
|       perror("Shared memory attach failure"); | ||||
|       shmctl(shmids[r], IPC_RMID, NULL); | ||||
|       exit(2); | ||||
|     } | ||||
|     printf("shmaddr: %p\n", ShmCommBufs[r]); | ||||
|   } | ||||
|   MPI_Barrier(ShmComm); | ||||
|   // Mark for clean up | ||||
|   for(int r=0;r<ShmSize;r++){ | ||||
|     shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL); | ||||
|   } | ||||
|   MPI_Barrier(ShmComm); | ||||
|  | ||||
| #endif | ||||
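The enabled branch above is a POSIX shared-memory handshake: rank 0 of each node creates and sizes one region per local rank, and the remaining ranks attach after a barrier; the disabled branch is the SysV `shmget`/`shmat` equivalent with huge-page support. A stripped-down standalone sketch of the same create/attach pattern (hypothetical name and size, no MPI):

```cpp
#include <cassert>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

void *shm_region(const char *name, size_t size, bool create) {
  int flags = create ? (O_RDWR | O_CREAT) : O_RDWR;
  int fd = shm_open(name, flags, 0666);         // named POSIX shm object
  assert(fd >= 0);
  if (create) assert(ftruncate(fd, size) == 0); // size it once, on creation
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  assert(ptr != MAP_FAILED);
  close(fd);                                    // the mapping survives the close
  return ptr;
}
```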
|   ShmCommBuf         = ShmCommBufs[ShmRank]; | ||||
|  | ||||
|   MPI_Barrier(ShmComm); | ||||
|   if ( ShmRank == 0 ) { | ||||
|     for(int r=0;r<ShmSize;r++){ | ||||
|       uint64_t * check = (uint64_t *) ShmCommBufs[r]; | ||||
|       check[0] = GroupRank; | ||||
|       check[1] = r; | ||||
|       check[2] = 0x5A5A5A; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   MPI_Barrier(ShmComm); | ||||
|   for(int r=0;r<ShmSize;r++){ | ||||
|     uint64_t * check = (uint64_t *) ShmCommBufs[r]; | ||||
|      | ||||
|     assert(check[0]==GroupRank); | ||||
|     assert(check[1]==r); | ||||
|     assert(check[2]==0x5A5A5A); | ||||
|  | ||||
|   } | ||||
|   MPI_Barrier(ShmComm); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Verbose for now | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -192,7 +304,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|     std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected "; | ||||
|     std::cout<< WorldSize << " Ranks " ; | ||||
|     std::cout<< GroupSize << " Nodes " ; | ||||
|     std::cout<<  ShmSize  << " with ranks-per-node "<<std::endl; | ||||
|     std::cout<< " with "<< ShmSize  << " ranks-per-node "<<std::endl; | ||||
|      | ||||
|     std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size "; | ||||
|     std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl; | ||||
| @@ -207,7 +319,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|       if(g!=ShmSize-1) std::cout<<","; | ||||
|       else std::cout<<"}"<<std::endl; | ||||
|     } | ||||
|  | ||||
|   } | ||||
|    | ||||
|   for(int g=0;g<GroupSize;g++){ | ||||
| @@ -216,7 +327,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|       if ( (ShmRank == 0) && (GroupRank==g) ) { | ||||
| 	std::cout<<MyGroup[r]; | ||||
| 	if(r<ShmSize-1) std::cout<<","; | ||||
| 	else std::cout<<"}"<<std::endl; | ||||
| 	else std::cout<<"}"<<std::endl<<std::flush; | ||||
|       } | ||||
|       MPI_Barrier(communicator_world); | ||||
|     } | ||||
| @@ -225,14 +336,12 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||
|   assert(ShmSetup==0);  ShmSetup=1; | ||||
| } | ||||
|  | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Want to implement some magic ... Group sub-cubes into those on same node | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source) | ||||
| { | ||||
|   std::vector<int> coor = _processor_coor; | ||||
|  | ||||
|   std::vector<int> coor = _processor_coor; // my coord | ||||
|   assert(std::abs(shift) <_processors[dim]); | ||||
|  | ||||
|   coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim]; | ||||
| @@ -242,26 +351,30 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest | ||||
|   coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim]; | ||||
|   Lexicographic::IndexFromCoor(coor,dest,_processors); | ||||
|   dest = LexicographicToWorldRank[dest]; | ||||
| } | ||||
|  | ||||
| }// rank is world rank. | ||||
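|  | ||||
| // Illustrative usage sketch (variable names hypothetical, not from this file): | ||||
| // a halo exchange obtains the two neighbouring world ranks for a unit shift | ||||
| // in a dimension before posting its comms, e.g. | ||||
| //   int nbr_a, nbr_b; | ||||
| //   ShiftedRanks(dim, 1, nbr_a, nbr_b);  // send to one, receive from the other | ||||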
|  | ||||
| int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||
| { | ||||
|   int rank; | ||||
|   Lexicographic::IndexFromCoor(coor,rank,_processors); | ||||
|   rank = LexicographicToWorldRank[rank]; | ||||
|   return rank; | ||||
| } | ||||
| }// rank is world rank | ||||
|  | ||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||
| { | ||||
|   Lexicographic::CoorFromIndex(coor,rank,_processors); | ||||
|   rank = LexicographicToWorldRank[rank]; | ||||
|   int lr=-1; | ||||
|   for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor | ||||
|     if( LexicographicToWorldRank[r]==rank) lr = r; | ||||
|   } | ||||
|   assert(lr!=-1); | ||||
|   Lexicographic::CoorFromIndex(coor,lr,_processors); | ||||
| } | ||||
|  | ||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
| {  | ||||
|   int ierr; | ||||
|  | ||||
|   communicator=communicator_world; | ||||
|  | ||||
|   _ndimension = processors.size(); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| @@ -280,19 +393,17 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
|   // Identify subblock of ranks on node spreading across dims | ||||
|   // in a maximally symmetrical way | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int dim = 0; | ||||
|    | ||||
|   std::vector<int> WorldDims = processors; | ||||
|  | ||||
|   ShmDims.resize(_ndimension,1); | ||||
|   ShmDims.resize  (_ndimension,1); | ||||
|   GroupDims.resize(_ndimension); | ||||
|      | ||||
|   ShmCoor.resize(_ndimension); | ||||
|   ShmCoor.resize  (_ndimension); | ||||
|   GroupCoor.resize(_ndimension); | ||||
|   WorldCoor.resize(_ndimension); | ||||
|  | ||||
|   int dim = 0; | ||||
|   for(int l2=0;l2<log2size;l2++){ | ||||
|     while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension; | ||||
|     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension; | ||||
|     ShmDims[dim]*=2; | ||||
|     dim=(dim+1)%_ndimension; | ||||
|   } | ||||
| @@ -304,6 +415,29 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
|     GroupDims[d] = WorldDims[d]/ShmDims[d]; | ||||
|   } | ||||
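|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Illustrative replay of the doubling loop (hypothetical sizes): with | ||||
|   // 8 ranks per node (log2size==3) on a 4.4.4.4 processor grid it yields | ||||
|   // ShmDims {2,2,2,1} and hence GroupDims {2,2,2,4}. | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| #if 0 | ||||
|   std::vector<int> WDims = {4,4,4,4};  // hypothetical WorldDims | ||||
|   std::vector<int> SDims(4,1);         // plays the role of ShmDims | ||||
|   int dd = 0; | ||||
|   for(int l2=0;l2<3;l2++){             // log2size==3, i.e. 8 ranks per node | ||||
|     while ( (WDims[dd] / SDims[dd]) <= 1 ) dd=(dd+1)%4; | ||||
|     SDims[dd]*=2;                      // doubles dims 0,1,2 in turn | ||||
|     dd=(dd+1)%4;                       // round-robin keeps the split symmetric | ||||
|   }                                    // SDims ends as {2,2,2,1} | ||||
| #endif | ||||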
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Verbose | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| #if 0 | ||||
|   std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl; | ||||
|   std::cout<< GridLogMessage << "SHM   "; | ||||
|   for(int d=0;d<_ndimension;d++){ | ||||
|     std::cout<< ShmDims[d] <<" "; | ||||
|   } | ||||
|   std::cout<< std::endl; | ||||
|  | ||||
|   std::cout<< GridLogMessage << "Group "; | ||||
|   for(int d=0;d<_ndimension;d++){ | ||||
|     std::cout<< GroupDims[d] <<" "; | ||||
|   } | ||||
|   std::cout<< std::endl; | ||||
|  | ||||
|   std::cout<< GridLogMessage<<"World "; | ||||
|   for(int d=0;d<_ndimension;d++){ | ||||
|     std::cout<< WorldDims[d] <<" "; | ||||
|   } | ||||
|   std::cout<< std::endl; | ||||
| #endif | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Check processor counts match | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| @@ -317,29 +451,57 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
|        | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Establish mapping between lexico physics coord and WorldRank | ||||
|   //  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   LexicographicToWorldRank.resize(WorldSize,0); | ||||
|   Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims); | ||||
|   Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims); | ||||
|   for(int d=0;d<_ndimension;d++){ | ||||
|     WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d]; | ||||
|   } | ||||
|   _processor_coor = WorldCoor; | ||||
|  | ||||
|   int lexico; | ||||
|   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); | ||||
|   LexicographicToWorldRank[lexico]=WorldRank; | ||||
|   _processor = lexico; | ||||
|   _processor      = WorldRank; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
|   // global sum Lexico to World mapping | ||||
|   /////////////////////////////////////////////////////////////////// | ||||
|   int lexico; | ||||
|   LexicographicToWorldRank.resize(WorldSize,0); | ||||
|   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); | ||||
|   LexicographicToWorldRank[lexico] = WorldRank; | ||||
|   ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
|  | ||||
|   for(int i=0;i<WorldSize;i++){ | ||||
|  | ||||
|     int wr = LexicographicToWorldRank[i]; | ||||
|     //    int wr = i; | ||||
|  | ||||
|     std::vector<int> coor(_ndimension); | ||||
|     ProcessorCoorFromRank(wr,coor); // from world rank | ||||
|     int ck = RankFromProcessorCoor(coor); | ||||
|     assert(ck==wr); | ||||
|  | ||||
|     if ( wr == WorldRank ) {  | ||||
|       for(int j=0;j<coor.size();j++) { | ||||
| 	assert(coor[j] == _processor_coor[j]); | ||||
|       } | ||||
|     } | ||||
|     /* | ||||
|     std::cout << GridLogMessage<< " Lexicographic "<<i; | ||||
|     std::cout << " MPI rank      "<<wr; | ||||
|     std::cout << " Coor          "; | ||||
|     for(int j=0;j<coor.size();j++) std::cout << coor[j]; | ||||
|     std::cout<< std::endl; | ||||
|     */ | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Check everyone agrees on everyone else's coords | ||||
|     ///////////////////////////////////////////////////// | ||||
|     std::vector<int> mcoor = coor; | ||||
|     this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int)); | ||||
|     for(int d = 0 ; d< _ndimension; d++) { | ||||
|       assert(coor[d] == mcoor[d]); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
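|  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| // Note on the table-merge idiom above: LexicographicToWorldRank is | ||||
| // zero-initialised, each rank writes only its own slot, and the MPI_SUM | ||||
| // Allreduce over zeros reassembles the complete map on every rank: | ||||
| // | ||||
| //   std::vector<int> table(WorldSize,0);   // zeros everywhere | ||||
| //   table[lexico] = WorldRank;             // one non-zero entry per rank | ||||
| //   MPI_Allreduce(MPI_IN_PLACE,&table[0],WorldSize,MPI_INT,MPI_SUM,communicator); | ||||
| /////////////////////////////////////////////////////////////////// | ||||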
| void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| @@ -348,6 +510,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalXOR(uint64_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
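| // Usage sketch (checksum variable hypothetical): XOR is order-independent, so | ||||
| // GlobalXOR can combine per-rank partial checksums into a value on which all | ||||
| // ranks agree: | ||||
| //   uint32_t csum = local_csum;  // per-rank partial checksum | ||||
| //   GlobalXOR(csum);             // csum now identical on every rank | ||||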
| void CartesianCommunicator::GlobalSum(float &f){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| @@ -367,8 +537,6 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||
|   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Basic Halo comms primitive | ||||
| void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||
| 					   int dest, | ||||
| @@ -377,10 +545,14 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||
| 					   int bytes) | ||||
| { | ||||
|   std::vector<CommsRequest_t> reqs(0); | ||||
|   //    unsigned long  xcrc = crc32(0L, Z_NULL, 0); | ||||
|   //    unsigned long  rcrc = crc32(0L, Z_NULL, 0); | ||||
|   //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); | ||||
|   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||
|   SendToRecvFromComplete(reqs); | ||||
|   //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes); | ||||
|   //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||
| 					   void *recv, | ||||
| 					   int sender, | ||||
| @@ -397,7 +569,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||
|     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Basic Halo comms primitive | ||||
| void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 						void *xmit, | ||||
| @@ -406,95 +577,29 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | ||||
| 						int from, | ||||
| 						int bytes) | ||||
| { | ||||
| #if 0 | ||||
|   // (removed) intra-node fast path: small messages are copied through the | ||||
|   // shared-memory segment with a sequence-number handshake; large or | ||||
|   // off-node messages fall through to plain MPI. | ||||
|   this->StencilBarrier(); | ||||
|  | ||||
|   MPI_Request xrq; | ||||
|   MPI_Request rrq; | ||||
|  | ||||
|   static int sequence; | ||||
|  | ||||
|   int ierr; | ||||
|   int tag; | ||||
|   int check; | ||||
|  | ||||
|   assert(dest != _processor); | ||||
|   assert(from != _processor); | ||||
|  | ||||
|   int gdest = GroupRanks[dest]; | ||||
|   int gfrom = GroupRanks[from]; | ||||
|   int gme   = GroupRanks[_processor]; | ||||
|  | ||||
|   sequence++; | ||||
|  | ||||
|   char *from_ptr = (char *)ShmCommBufs[ShmRank]; | ||||
|  | ||||
|   int small = (bytes<MAX_MPI_SHM_BYTES); | ||||
|  | ||||
|   typedef uint64_t T; | ||||
|   int words = bytes/sizeof(T); | ||||
|  | ||||
|   assert(((size_t)bytes &(sizeof(T)-1))==0); | ||||
|   assert(gme == ShmRank); | ||||
|  | ||||
|   if ( small && (gdest !=MPI_UNDEFINED) ) { | ||||
|  | ||||
|     char *to_ptr   = (char *)ShmCommBufs[gdest]; | ||||
|  | ||||
|     assert(gme != gdest); | ||||
|  | ||||
|     T *ip = (T *)xmit; | ||||
|     T *op = (T *)to_ptr; | ||||
| PARALLEL_FOR_LOOP  | ||||
|     for(int w=0;w<words;w++) { | ||||
|       op[w]=ip[w]; | ||||
|     } | ||||
|  | ||||
|     bcopy(&_processor,&to_ptr[bytes],sizeof(_processor)); | ||||
|     bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence)); | ||||
|   } else {  | ||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|     assert(ierr==0); | ||||
|     list.push_back(xrq); | ||||
|   } | ||||
|  | ||||
|   this->StencilBarrier(); | ||||
|  | ||||
|   if (small && (gfrom !=MPI_UNDEFINED) ) { | ||||
|     T *ip = (T *)from_ptr; | ||||
|     T *op = (T *)recv; | ||||
| PARALLEL_FOR_LOOP  | ||||
|     for(int w=0;w<words;w++) { | ||||
|       op[w]=ip[w]; | ||||
|     } | ||||
|     bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag)); | ||||
|     bcopy(&from_ptr[bytes+4],&check,sizeof(check)); | ||||
|     assert(check==sequence); | ||||
|     assert(tag==from); | ||||
|   } else {  | ||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|     assert(ierr==0); | ||||
|     list.push_back(rrq); | ||||
|   } | ||||
|  | ||||
|   this->StencilBarrier(); | ||||
|  | ||||
| #else | ||||
|   MPI_Request xrq; | ||||
|   MPI_Request rrq; | ||||
|   int rank = _processor; | ||||
|   int ierr; | ||||
|   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|  | ||||
|   assert(ierr==0); | ||||
|  | ||||
|   list.push_back(xrq); | ||||
|   list.push_back(rrq); | ||||
| #endif | ||||
|  | ||||
|   // (replacement) choose concurrent non-blocking pairs or a blocking | ||||
|   // Sendrecv via the CommunicatorPolicy switch. | ||||
|   int myrank = _processor; | ||||
|   int ierr; | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||
|     MPI_Request xrq; | ||||
|     MPI_Request rrq; | ||||
|  | ||||
|     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|  | ||||
|     assert(ierr==0); | ||||
|     list.push_back(xrq); | ||||
|     list.push_back(rrq); | ||||
|   } else {  | ||||
|     // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||
|     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, | ||||
| 		      recv,bytes,MPI_CHAR,from, from, | ||||
| 		      communicator,MPI_STATUS_IGNORE); | ||||
|     assert(ierr==0); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 						       void *xmit, | ||||
| 						       int dest, | ||||
| 						       void *recv, | ||||
| @@ -505,57 +610,63 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_ | ||||
|   MPI_Request rrq; | ||||
|  | ||||
|   int ierr; | ||||
|  | ||||
|   assert(dest != _processor); | ||||
|   assert(from != _processor); | ||||
|  | ||||
|   int gdest = GroupRanks[dest]; | ||||
|   int gfrom = GroupRanks[from]; | ||||
|   int gme   = GroupRanks[_processor]; | ||||
|  | ||||
|   assert(gme  == ShmRank); | ||||
|   double off_node_bytes=0.0; | ||||
|  | ||||
| #ifdef FORCE_COMMS | ||||
|   gdest = MPI_UNDEFINED; | ||||
|   gfrom = MPI_UNDEFINED; | ||||
| #endif | ||||
|   if ( gfrom ==MPI_UNDEFINED) { | ||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||
|     assert(ierr==0); | ||||
|     list.push_back(rrq); | ||||
|     off_node_bytes+=bytes; | ||||
|   } | ||||
|  | ||||
|   if ( gdest == MPI_UNDEFINED ) { | ||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||
|     assert(ierr==0); | ||||
|     list.push_back(xrq); | ||||
|     off_node_bytes+=bytes; | ||||
|   } | ||||
|  | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  | ||||
|     this->StencilSendToRecvFromComplete(list); | ||||
|   } | ||||
|   return off_node_bytes; | ||||
| } | ||||
|  | ||||
|  | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | ||||
| { | ||||
|   SendToRecvFromComplete(list); | ||||
|   SendToRecvFromComplete(waitall); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::StencilBarrier(void) | ||||
| { | ||||
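|   // The sync/barrier/sync sandwich orders shared-memory loads and stores with | ||||
|   // respect to the barrier on weakly ordered architectures. | ||||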
|   MPI_Win_sync (ShmWindow);    | ||||
|   MPI_Barrier  (ShmComm); | ||||
|   MPI_Win_sync (ShmWindow);    | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   int nreq=list.size(); | ||||
|  | ||||
|   if (nreq==0) return; | ||||
|  | ||||
|   std::vector<MPI_Status> status(nreq); | ||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||
|   assert(ierr==0); | ||||
|   list.resize(0); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::Barrier(void) | ||||
| { | ||||
|   int ierr = MPI_Barrier(communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||
| { | ||||
|   int ierr=MPI_Bcast(data, | ||||
| @@ -565,7 +676,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||
| 		     communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
|  | ||||
| int CartesianCommunicator::RankWorld(void){  | ||||
|   int r;  | ||||
|   MPI_Comm_rank(communicator_world,&r); | ||||
|   return r; | ||||
| } | ||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||
| { | ||||
|   int ierr= MPI_Bcast(data, | ||||
|   | ||||
| @@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     /*  END LEGAL */ | ||||
| #include "Grid.h" | ||||
| #include <mpi.h> | ||||
| //#include <numaif.h> | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| /// Workarounds: | ||||
| @@ -42,19 +43,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <fcntl.h> | ||||
| #include <unistd.h> | ||||
| #include <limits.h> | ||||
|  | ||||
| typedef sem_t *Grid_semaphore; | ||||
|  | ||||
|  | ||||
| #error /* This is deprecated */ | ||||
|  | ||||
| #if 0  | ||||
| #define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED ); | ||||
| #define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED ); | ||||
| #define SEM_POST(S) assert ( sem_post(S) == 0 );  | ||||
| #define SEM_WAIT(S) assert ( sem_wait(S) == 0 ); | ||||
|  | ||||
| #else | ||||
| #define SEM_INIT(S)      ; | ||||
| #define SEM_INIT_EXCL(S) ; | ||||
| #define SEM_POST(S) ; | ||||
| #define SEM_WAIT(S) ; | ||||
| #endif | ||||
| #include <sys/mman.h> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL }; | ||||
| enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV }; | ||||
|  | ||||
| struct Descriptor { | ||||
|   uint64_t buf; | ||||
| @@ -62,6 +71,12 @@ struct Descriptor { | ||||
|   int rank; | ||||
|   int tag; | ||||
|   int command; | ||||
|   uint64_t xbuf; | ||||
|   uint64_t rbuf; | ||||
|   int xtag; | ||||
|   int rtag; | ||||
|   int src; | ||||
|   int dest; | ||||
|   MPI_Request request; | ||||
| }; | ||||
|  | ||||
| @@ -94,18 +109,14 @@ public: | ||||
|  | ||||
|   void SemInit(void) { | ||||
|     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); | ||||
|     //    printf("SEM_NAME: %s \n",sem_name); | ||||
|     SEM_INIT(sem_head); | ||||
|     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); | ||||
|     //    printf("SEM_NAME: %s \n",sem_name); | ||||
|     SEM_INIT(sem_tail); | ||||
|   }   | ||||
|   void SemInitExcl(void) { | ||||
|     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); | ||||
|     //    printf("SEM_INIT_EXCL: %s \n",sem_name); | ||||
|     SEM_INIT_EXCL(sem_head); | ||||
|     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); | ||||
|     //    printf("SEM_INIT_EXCL: %s \n",sem_name); | ||||
|     SEM_INIT_EXCL(sem_tail); | ||||
|   }   | ||||
|   void WakeUpDMA(void) {  | ||||
| @@ -125,6 +136,13 @@ public: | ||||
|     while(1){ | ||||
|       WaitForCommand(); | ||||
|       //      std::cout << "Getting command "<<std::endl; | ||||
| #if 0 | ||||
|       _mm_monitor((void *)&state->head,0,0); | ||||
|       int s=state->start; | ||||
|       if ( s != state->head ) { | ||||
| 	_mm_mwait(0,0); | ||||
|       } | ||||
| #endif | ||||
|       Event(); | ||||
|     } | ||||
|   } | ||||
| @@ -132,6 +150,7 @@ public: | ||||
|   int Event (void) ; | ||||
|  | ||||
|   uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ; | ||||
|   void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ; | ||||
|  | ||||
|   void WaitAll() { | ||||
|     //    std::cout << "Queueing WAIT command  "<<std::endl; | ||||
| @@ -141,7 +160,7 @@ public: | ||||
|     //    std::cout << "Waiting from semaphore "<<std::endl; | ||||
|     WaitForComplete(); | ||||
|     //    std::cout << "Checking FIFO is empty "<<std::endl; | ||||
|     assert ( state->tail == state->head ); | ||||
|     while ( state->tail != state->head ); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @@ -196,6 +215,12 @@ public: | ||||
|     //    std::cout << "Waking up DMA "<< slave<<std::endl; | ||||
|   }; | ||||
|  | ||||
|   static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)  | ||||
|   { | ||||
|     Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src); | ||||
|     Slaves[slave].WakeUpDMA(); | ||||
|   } | ||||
|  | ||||
|   static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) { | ||||
|     //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl; | ||||
|     Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank); | ||||
| @@ -226,6 +251,28 @@ public: | ||||
|     return; | ||||
|   }; | ||||
|  | ||||
|   static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) { | ||||
|     uint8_t * cxbuf = (uint8_t *) xbuf; | ||||
|     uint8_t * crbuf = (uint8_t *) rbuf; | ||||
|     static int rrp=0; | ||||
|     int procs = VerticalSize-1; | ||||
|     int myoff=0; | ||||
|     int mywork=bytes; | ||||
|     QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src); | ||||
|     rrp = rrp+1; | ||||
|     if ( rrp == (VerticalSize-1) ) rrp = 0; | ||||
|   } | ||||
|  | ||||
|   static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) { | ||||
|     uint8_t * cxbuf = (uint8_t *) xbuf; | ||||
|     uint8_t * crbuf = (uint8_t *) rbuf; | ||||
|     int mywork, myoff, procs; | ||||
|     procs = VerticalSize-1; | ||||
|     for(int s=0;s<procs;s++) { | ||||
|       GetWork(bytes,s,mywork,myoff,procs); | ||||
|       QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src); | ||||
|     } | ||||
|   }; | ||||
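|   // Sketch of the contract GetWork must satisfy (illustrative even split, not | ||||
|   // necessarily the exact implementation used here): partition nwork bytes | ||||
|   // into `units` contiguous chunks, one per slave. | ||||
|   // | ||||
|   //   void GetWorkSketch(int nwork,int me,int &mywork,int &myoff,int units){ | ||||
|   //     int base = nwork/units, rem = nwork%units; | ||||
|   //     mywork = base + (me<rem ? 1:0);        // first `rem` chunks one larger | ||||
|   //     myoff  = me*base + (me<rem ? me:rem);  // offset = sum of earlier chunks | ||||
|   //   } | ||||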
|   static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) { | ||||
|     uint8_t * cbuf = (uint8_t *) buf; | ||||
|     int mywork, myoff, procs; | ||||
| @@ -275,6 +322,7 @@ std::vector<void *>            MPIoffloadEngine::VerticalShmBufs; | ||||
| std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks; | ||||
| std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks;  | ||||
|  | ||||
| int CartesianCommunicator::NodeCount(void)    { return HorizontalSize;}; | ||||
| int MPIoffloadEngine::ShmSetup = 0; | ||||
|  | ||||
| void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | ||||
| @@ -370,12 +418,22 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | ||||
|       ftruncate(fd, size); | ||||
|  | ||||
|       VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||||
|  | ||||
|       if ( VerticalShmBufs[r] == MAP_FAILED ) {  | ||||
| 	perror("failed mmap"); | ||||
| 	assert(0); | ||||
|       } | ||||
|  | ||||
|       /* | ||||
|       for(uint64_t page=0;page<size;page+=4096){ | ||||
| 	void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] ); | ||||
| 	int status; | ||||
| 	int flags=MPOL_MF_MOVE_ALL; | ||||
| 	int nodes=1; // numa domain == MCDRAM | ||||
| 	unsigned long count=1; | ||||
| 	ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||
| 	if (ierr && (page==0)) perror("numa relocate command failed"); | ||||
|       } | ||||
|       */ | ||||
|       uint64_t * check = (uint64_t *) VerticalShmBufs[r]; | ||||
|       check[0] = WorldRank; | ||||
|       check[1] = r; | ||||
| @@ -404,7 +462,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | ||||
|     uint64_t * check = (uint64_t *) VerticalShmBufs[r]; | ||||
|     assert(check[0]== WorldRank); | ||||
|     assert(check[1]== r); | ||||
|     std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl; | ||||
|     //    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl; | ||||
|   } | ||||
|   } | ||||
| #endif | ||||
| @@ -542,6 +600,8 @@ int Slave::Event (void) { | ||||
|   static int head_last; | ||||
|   static int start_last; | ||||
|   int ierr; | ||||
|   MPI_Status stat; | ||||
|   static int i=0; | ||||
|  | ||||
|   //////////////////////////////////////////////////// | ||||
|   // Try to advance the start pointers | ||||
| @@ -550,11 +610,6 @@ int Slave::Event (void) { | ||||
|   if ( s != state->head ) { | ||||
|     switch ( state->Descrs[s].command ) { | ||||
|     case COMMAND_ISEND: | ||||
|       /* | ||||
|             std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]" | ||||
|       	       << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag | ||||
|        << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl; | ||||
|       */ | ||||
|       ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),  | ||||
| 		       state->Descrs[s].bytes,  | ||||
| 		       MPI_CHAR, | ||||
| @@ -568,11 +623,6 @@ int Slave::Event (void) { | ||||
|       break; | ||||
|  | ||||
|     case COMMAND_IRECV: | ||||
|       /* | ||||
|       std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]" | ||||
| 	       << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag | ||||
| 	       << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl; | ||||
|       */ | ||||
|       ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),  | ||||
| 		     state->Descrs[s].bytes,  | ||||
| 		     MPI_CHAR, | ||||
| @@ -588,10 +638,32 @@ int Slave::Event (void) { | ||||
|       return 1; | ||||
|       break; | ||||
|  | ||||
|     case COMMAND_SENDRECV: | ||||
|  | ||||
|       //      fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10); | ||||
|  | ||||
|       ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10, | ||||
| 			(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10, | ||||
| 			MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE); | ||||
|  | ||||
|       assert(ierr==0); | ||||
|  | ||||
|       //      fprintf(stderr,"Sendrecv done %d %d\n",ierr,i); | ||||
|       //      MPI_Barrier(MPIoffloadEngine::HorizontalComm); | ||||
|       //      fprintf(stderr,"Barrier\n"); | ||||
|       i++; | ||||
|  | ||||
|       state->start = PERI_PLUS(s); | ||||
|  | ||||
|       return 1; | ||||
|       break; | ||||
|  | ||||
|     case COMMAND_WAITALL: | ||||
|  | ||||
|       for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){ | ||||
| 	MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE); | ||||
| 	if ( state->Descrs[t].command != COMMAND_SENDRECV ) { | ||||
| 	  MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE); | ||||
| 	} | ||||
|       }; | ||||
|       s=PERI_PLUS(s); | ||||
|       state->start = s; | ||||
| @@ -613,6 +685,45 @@ int Slave::Event (void) { | ||||
|   // External interaction with the queue | ||||
|   ////////////////////////////////////////////////////////////////////////////// | ||||
|    | ||||
| void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)  | ||||
| { | ||||
|   int head =state->head; | ||||
|   int next = PERI_PLUS(head); | ||||
|    | ||||
|   // Set up descriptor | ||||
|   int worldrank; | ||||
|   int hashtag; | ||||
|   MPI_Comm    communicator; | ||||
|   MPI_Request request; | ||||
|   uint64_t relative; | ||||
|    | ||||
|   relative = (uint64_t)xbuf - base; | ||||
|   state->Descrs[head].xbuf    = relative; | ||||
|    | ||||
|   relative= (uint64_t)rbuf - base; | ||||
|   state->Descrs[head].rbuf    = relative; | ||||
|    | ||||
|   state->Descrs[head].bytes  = bytes; | ||||
|    | ||||
|   MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest); | ||||
|   state->Descrs[head].dest   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]; | ||||
|   state->Descrs[head].xtag    = hashtag; | ||||
|    | ||||
|   MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src); | ||||
|   state->Descrs[head].src    = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]; | ||||
|   state->Descrs[head].rtag    = hashtag; | ||||
|    | ||||
|   state->Descrs[head].command= COMMAND_SENDRECV; | ||||
|    | ||||
|   // Block until FIFO has space | ||||
|   while( state->tail==next ); | ||||
|    | ||||
|   // Msync on weak order architectures | ||||
|    | ||||
|   // Advance pointer | ||||
|   state->head = next; | ||||
|    | ||||
| }; | ||||
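| // Note: QueueSendRecv follows the same single-producer FIFO discipline as | ||||
| // QueueCommand below: fill the descriptor at head, spin while the ring is | ||||
| // full (tail==next), then publish by advancing head. PERI_PLUS is assumed to | ||||
| // be the modular ring increment over the descriptor pool, i.e. ((A)+1)%pool. | ||||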
| uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)  | ||||
| { | ||||
|   ///////////////////////////////////////// | ||||
| @@ -812,19 +923,22 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_ | ||||
|   assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) ); | ||||
|   assert(from!=_processor); | ||||
|   assert(dest!=_processor); | ||||
|   MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from); | ||||
|  | ||||
|   //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from); | ||||
|  | ||||
|   //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest); | ||||
|   //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   MPIoffloadEngine::WaitAll(); | ||||
|   //this->Barrier(); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::StencilBarrier(void) | ||||
| { | ||||
| } | ||||
| void CartesianCommunicator::StencilBarrier(void) { } | ||||
|  | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   | ||||
| @@ -25,7 +25,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -58,6 +59,8 @@ void CartesianCommunicator::GlobalSum(double &){} | ||||
| void CartesianCommunicator::GlobalSum(uint32_t &){} | ||||
| void CartesianCommunicator::GlobalSum(uint64_t &){} | ||||
| void CartesianCommunicator::GlobalSumVector(double *,int N){} | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &){} | ||||
| void CartesianCommunicator::GlobalXOR(uint64_t &){} | ||||
|  | ||||
| void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||
| 					   void *recv, | ||||
| @@ -87,6 +90,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | ||||
| { | ||||
|   assert(0); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   assert(0); | ||||
| @@ -97,7 +101,7 @@ void CartesianCommunicator::Barrier(void){} | ||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | ||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | ||||
| int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | ||||
| void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;} | ||||
| void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  coor = _processor_coor; } | ||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||
| { | ||||
|   source =0; | ||||
|   | ||||
| @@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <mpp/shmem.h> | ||||
| #include <array> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| @@ -51,7 +52,7 @@ typedef struct HandShake_t { | ||||
| } HandShake; | ||||
|  | ||||
| std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) { | ||||
|   array<long,_SHMEM_REDUCE_SYNC_SIZE> ret; | ||||
|   std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret; | ||||
|   ret.fill(SHMEM_SYNC_VALUE); | ||||
|   return ret; | ||||
| } | ||||
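|  | ||||
| // SHMEM collectives need a pSync work array that is symmetric, persistent | ||||
| // across calls, and pre-filled with SHMEM_SYNC_VALUE; make_psync_init supplies | ||||
| // the fill and each call site below keeps a static copy, e.g. | ||||
| //   static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init; | ||||
| //   shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||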
| @@ -109,7 +110,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||
|  | ||||
|   source = u; | ||||
|   dest   = 0; | ||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|   shmem_barrier_all(); // necessary? | ||||
|   u = dest; | ||||
| } | ||||
| @@ -125,7 +126,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||
|  | ||||
|   source = u; | ||||
|   dest   = 0; | ||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|   shmem_barrier_all(); // necessary? | ||||
|   u = dest; | ||||
| } | ||||
| @@ -137,7 +138,8 @@ void CartesianCommunicator::GlobalSum(float &f){ | ||||
|  | ||||
|   source = f; | ||||
|   dest   =0.0; | ||||
|   shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|   shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|   shmem_barrier_all(); | ||||
|   f = dest; | ||||
| } | ||||
| void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||
| @@ -148,14 +150,16 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||
|   static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init; | ||||
|  | ||||
|   if ( shmem_addr_accessible(f,_processor)  ){ | ||||
|     shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync); | ||||
|     shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|     shmem_barrier_all(); | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   for(int i=0;i<N;i++){ | ||||
|     dest   =0.0; | ||||
|     source = f[i]; | ||||
|     shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|     shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|     shmem_barrier_all(); | ||||
|     f[i] = dest; | ||||
|   } | ||||
| } | ||||
| @@ -168,7 +172,8 @@ void CartesianCommunicator::GlobalSum(double &d) | ||||
|  | ||||
|   source = d; | ||||
|   dest   = 0; | ||||
|   shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|   shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|   shmem_barrier_all(); | ||||
|   d = dest; | ||||
| } | ||||
| void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||
| @@ -180,14 +185,16 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||
|  | ||||
|  | ||||
|   if ( shmem_addr_accessible(d,_processor)  ){ | ||||
|     shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync); | ||||
|     shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|     shmem_barrier_all(); | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   for(int i=0;i<N;i++){ | ||||
|     source = d[i]; | ||||
|     dest   =0.0; | ||||
|     shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||
|     shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); | ||||
|     shmem_barrier_all(); | ||||
|     d[i] = dest; | ||||
|   } | ||||
| } | ||||
| @@ -282,11 +289,13 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | ||||
|   SHMEM_VET(recv); | ||||
|   //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL); | ||||
|   shmem_putmem(recv,xmit,bytes,dest); | ||||
|  | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();  | ||||
| } | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   //  shmem_quiet();      // I'm done | ||||
|   shmem_barrier_all();// He's done too | ||||
|   if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too | ||||
| } | ||||
| void CartesianCommunicator::Barrier(void) | ||||
| { | ||||
| @@ -301,13 +310,13 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||
|   int words = bytes/4; | ||||
|  | ||||
|   if ( shmem_addr_accessible(data,_processor)  ){ | ||||
|     shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync); | ||||
|     shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data()); | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   for(int w=0;w<words;w++){ | ||||
|     word = array[w]; | ||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync); | ||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data()); | ||||
|     if ( shmem_my_pe() != root ) { | ||||
|       array[w] = word; | ||||
|     } | ||||
| @@ -325,7 +334,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||
|  | ||||
|   for(int w=0;w<words;w++){ | ||||
|     word = array[w]; | ||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync); | ||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data()); | ||||
|     if ( shmem_my_pe() != root ) { | ||||
|       array[w]= word; | ||||
|     } | ||||
| @@ -333,5 +342,9 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||
|   } | ||||
| } | ||||
|    | ||||
| int CartesianCommunicator::RankWorld(void){  | ||||
|   return shmem_my_pe(); | ||||
| } | ||||
|  | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -1,5 +1,4 @@ | ||||
|  | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -31,21 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| template<class vobj> | ||||
| class SimpleCompressor { | ||||
| public: | ||||
|   void Point(int) {}; | ||||
|  | ||||
|   vobj operator() (const vobj &arg) { | ||||
|     return arg; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| // Gather for when there is no need to SIMD split with compression | ||||
| // Gather for when there is no need to SIMD split  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| template<class vobj,class cobj,class compressor> void  | ||||
| Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) | ||||
| template<class vobj> void  | ||||
| Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|  | ||||
| @@ -53,19 +42,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen | ||||
|     cbmask = 0x3; | ||||
|   } | ||||
|    | ||||
|   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|    | ||||
|   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|  | ||||
|   int stride=rhs._grid->_slice_stride[dimension]; | ||||
|   if ( cbmask == 0x3 ) {  | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o  = n*stride; | ||||
| 	int bo = n*e2; | ||||
| 	buffer[off+bo+b]=compress(rhs._odata[so+o+b]); | ||||
| 	buffer[off+bo+b]=rhs._odata[so+o+b]; | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
| @@ -74,25 +61,23 @@ PARALLEL_NESTED_LOOP2 | ||||
|      for(int n=0;n<e1;n++){ | ||||
|        for(int b=0;b<e2;b++){ | ||||
| 	 int o  = n*stride; | ||||
| 	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b); | ||||
| 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||
| 	 if ( ocb &cbmask ) { | ||||
| 	   table.push_back(std::pair<int,int> (bo++,o+b)); | ||||
| 	 } | ||||
|        } | ||||
|      } | ||||
| PARALLEL_FOR_LOOP      | ||||
|      for(int i=0;i<table.size();i++){ | ||||
|        buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]); | ||||
|      parallel_for(int i=0;i<table.size();i++){ | ||||
|        buffer[off+table[i].first]=rhs._odata[so+table[i].second]; | ||||
|      } | ||||
|   } | ||||
| } | ||||
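|  | ||||
| // Usage sketch (buffer sizing schematic): gather one face of the lattice into | ||||
| // a contiguous comms buffer ahead of the MPI exchange: | ||||
| //   commVector<vobj> send_buf(buffer_size);                // face-sized buffer | ||||
| //   Gather_plane_simple(rhs,send_buf,dimension,plane,0x3); // 0x3: both checkerboards | ||||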
|  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| // Gather for when there *is* need to SIMD split with compression | ||||
| // Gather for when there *is* need to SIMD split  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| template<class cobj,class vobj,class compressor> void  | ||||
| Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress) | ||||
| template<class vobj> void  | ||||
| Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|  | ||||
| @@ -105,57 +90,40 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_ | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int n1=rhs._grid->_slice_stride[dimension]; | ||||
|   int n2=rhs._grid->_slice_block[dimension]; | ||||
|  | ||||
|   if ( cbmask ==0x3){ | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|  | ||||
| 	int o      =   n*n1; | ||||
| 	int offset = b+n*n2; | ||||
| 	cobj temp =compress(rhs._odata[so+o+b]); | ||||
| 	extract<cobj>(temp,pointers,offset); | ||||
|  | ||||
| 	int offset = b+n*e2; | ||||
| 	vobj temp =rhs._odata[so+o+b]; | ||||
| 	extract<vobj>(temp,pointers,offset); | ||||
|  | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
|  | ||||
|     assert(0); //Fixme think this is buggy | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  | ||||
|     // Test_cshift_red_black code. | ||||
|     std::cout << " Dense packed buffer WARNING " <<std::endl; | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||
|  | ||||
| 	int o=n*n1; | ||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||
| 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||
| 	int offset = b+n*e2; | ||||
|  | ||||
| 	if ( ocb & cbmask ) { | ||||
| 	  cobj temp =compress(rhs._odata[so+o+b]); | ||||
| 	  extract<cobj>(temp,pointers,offset); | ||||
| 	  vobj temp =rhs._odata[so+o+b]; | ||||
| 	  extract<vobj>(temp,pointers,offset); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Gather for when there is no need to SIMD split | ||||
| ////////////////////////////////////////////////////// | ||||
| template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||
| { | ||||
|   SimpleCompressor<vobj> dontcompress; | ||||
|   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Gather for when there *is* need to SIMD split | ||||
| ////////////////////////////////////////////////////// | ||||
| template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) | ||||
| { | ||||
|   SimpleCompressor<vobj> dontcompress; | ||||
|   Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Scatter for when there is no need to SIMD split | ||||
| ////////////////////////////////////////////////////// | ||||
| @@ -171,10 +139,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | ||||
|      | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int stride=rhs._grid->_slice_stride[dimension]; | ||||
|    | ||||
|   if ( cbmask ==0x3 ) { | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||
| @@ -182,24 +150,28 @@ PARALLEL_NESTED_LOOP2 | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
|     std::vector<std::pair<int,int> > table; | ||||
|     int bo=0; | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||
| 	if ( ocb & cbmask ) { | ||||
| 	  rhs._odata[so+o+b]=buffer[bo++]; | ||||
| 	  table.push_back(std::pair<int,int> (so+o+b,bo++)); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|     parallel_for(int i=0;i<table.size();i++){ | ||||
|        //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl; | ||||
|        rhs._odata[table[i].first]=buffer[table[i].second]; | ||||
|      } | ||||
|   } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Scatter for when there *is* need to SIMD split | ||||
| ////////////////////////////////////////////////////// | ||||
|  template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask) | ||||
| template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|  | ||||
| @@ -213,8 +185,7 @@ PARALLEL_NESTED_LOOP2 | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|  | ||||
|   if(cbmask ==0x3 ) { | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||
| @@ -222,7 +193,11 @@ PARALLEL_NESTED_LOOP2 | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
|     assert(0); // think this is buggy FIXME | ||||
|  | ||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  | ||||
|     // Test_cshift_red_black code. | ||||
|     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME | ||||
|     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||
| @@ -254,8 +229,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int stride = rhs._grid->_slice_stride[dimension]; | ||||
|   if(cbmask == 0x3 ){ | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|   | ||||
|         int o =n*stride+b; | ||||
| @@ -264,8 +238,7 @@ PARALLEL_NESTED_LOOP2 | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|   | ||||
|         int o =n*stride+b; | ||||
| @@ -295,8 +268,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block [dimension]; | ||||
|   int stride = rhs._grid->_slice_stride[dimension]; | ||||
| PARALLEL_NESTED_LOOP2 | ||||
|   for(int n=0;n<e1;n++){ | ||||
|  | ||||
|   parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|   for(int b=0;b<e2;b++){ | ||||
|  | ||||
|       int o  =n*stride; | ||||
| @@ -338,8 +311,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | ||||
|   // Map to always positive shift modulo global full dimension. | ||||
|   shift = (shift+fd)%fd; | ||||
|  | ||||
|   ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||
|   // the permute type | ||||
|   ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||
|   int permute_dim =grid->PermuteDim(dimension); | ||||
|   int permute_type=grid->PermuteType(dimension); | ||||
|   int permute_type_dist; | ||||
| @@ -348,7 +321,6 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | ||||
|  | ||||
|     int o   = 0; | ||||
|     int bo  = x * grid->_ostride[dimension]; | ||||
|      | ||||
|     int cb= (cbmask==0x2)? Odd : Even; | ||||
|  | ||||
|     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||
| @@ -361,9 +333,23 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | ||||
|     // wrap is whether sshift > rd. | ||||
|     //  num is sshift mod rd. | ||||
|     //  | ||||
|     //  shift 7 | ||||
|     // | ||||
|     //  XoXo YcYc  | ||||
|     //  oXoX cYcY | ||||
|     //  XoXo YcYc | ||||
|     //  oXoX cYcY | ||||
|     // | ||||
|     //  sshift --  | ||||
|     // | ||||
|     //  XX YY ; 3 | ||||
|     //  XX YY ; 0 | ||||
|     //  XX YY ; 3 | ||||
|     //  XX YY ; 0 | ||||
|     // | ||||
|     int permute_slice=0; | ||||
|     if(permute_dim){ | ||||
|       int wrap = sshift/rd; | ||||
|       int wrap = sshift/rd; wrap=wrap % ly; | ||||
|       int  num = sshift%rd; | ||||
|  | ||||
|       if ( x< rd-num ) permute_slice=wrap; | ||||
| @@ -375,7 +361,6 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | ||||
|       } else { | ||||
| 	permute_type_dist = permute_type; | ||||
|       } | ||||
|        | ||||
|     } | ||||
|  | ||||
|     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); | ||||
|   | ||||
| @@ -74,7 +74,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r | ||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||
|  | ||||
|   //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||
|  | ||||
|   if ( sshift[0] == sshift[1] ) { | ||||
|     //    std::cout << "Single pass Cshift_comms" <<std::endl; | ||||
|     Cshift_comms(ret,rhs,dimension,shift,0x3); | ||||
| @@ -154,10 +153,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
| 			   (void *)&recv_buf[0], | ||||
| 			   recv_from_rank, | ||||
| 			   bytes); | ||||
|       grid->Barrier(); | ||||
|  | ||||
|       //      for(int i=0;i<words;i++){ | ||||
|       //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl; | ||||
|       //      } | ||||
|       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); | ||||
|     } | ||||
|   } | ||||
| @@ -243,7 +240,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
| 			     (void *)&recv_buf_extract[i][0], | ||||
| 			     recv_from_rank, | ||||
| 			     bytes); | ||||
|  | ||||
| 	grid->Barrier(); | ||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | ||||
|       } else {  | ||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||
|   | ||||
							
								
								
									
lib/json/json.hpp: 12276 lines, new file (diff suppressed because it is too large).
							| @@ -39,8 +39,7 @@ namespace Grid { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
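|       // Build the result in a register-resident temporary, then commit it with | ||||
|       // a single non-temporal vstream() store: the destination is write-only in | ||||
|       // this loop, so bypassing the cache saves store bandwidth. | ||||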
|       obj1 tmp; | ||||
|       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||
| @@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||
| @@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||
| @@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||
| @@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP | ||||
|     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(lhs,ret); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|       obj1 tmp; | ||||
|       mult(&tmp,&lhs._odata[ss],&rhs); | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| @@ -120,8 +115,7 @@ PARALLEL_FOR_LOOP | ||||
|     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,lhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|       obj1 tmp; | ||||
|       mac(&tmp,&lhs._odata[ss],&rhs); | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| @@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP | ||||
|     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,lhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       sub(&tmp,&lhs._odata[ss],&rhs); | ||||
| @@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP | ||||
|     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(lhs,ret); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       add(&tmp,&lhs._odata[ss],&rhs); | ||||
| @@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP | ||||
|     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = rhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       mult(&tmp,&lhs,&rhs._odata[ss]); | ||||
| @@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP | ||||
|     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = rhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       mac(&tmp,&lhs,&rhs._odata[ss]); | ||||
| @@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP | ||||
|     void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = rhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       sub(&tmp,&lhs,&rhs._odata[ss]); | ||||
| @@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP | ||||
|     void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = rhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp; | ||||
|       add(&tmp,&lhs,&rhs._odata[ss]); | ||||
| @@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP | ||||
|     ret.checkerboard = x.checkerboard; | ||||
|     conformable(ret,x); | ||||
|     conformable(x,y); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<x._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<x._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = a*x._odata[ss]+y._odata[ss]; | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| @@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP | ||||
|     ret.checkerboard = x.checkerboard; | ||||
|     conformable(ret,x); | ||||
|     conformable(x,y); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<x._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<x._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = a*x._odata[ss]+b*y._odata[ss]; | ||||
|       vstream(ret._odata[ss],tmp); | ||||
|   | ||||
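Every hunk in this file makes the same mechanical change: the PARALLEL_FOR_LOOP macro plus a separate for statement collapses into a single parallel_for statement. A minimal sketch of how such a macro pair can work, assuming an OpenMP backend (this mirrors the idea only, not necessarily Grid's exact Threads.h):

    #include <cstdio>

    #ifdef _OPENMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif
    // Fusing the pragma and the loop keyword keeps call sites to one line:
    #define parallel_for PARALLEL_FOR_LOOP for

    int main(void)
    {
      static double data[1024];
      parallel_for(int ss = 0; ss < 1024; ss++) { // expands to pragma + for(...)
        data[ss] = 2.0 * ss;
      }
      std::printf("%f\n", data[1023]); // 2046.000000
      return 0;
    }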
| @@ -121,8 +121,7 @@ public: | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| @@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| @@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       //vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,eval(ss,expr)); | ||||
| @@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP | ||||
|     checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| @@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP | ||||
|     checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| @@ -235,72 +230,78 @@ PARALLEL_FOR_LOOP | ||||
|     checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       vstream(_odata[ss] ,eval(ss,expr)); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|     ////////////////////////////////////////////////////////////////// | ||||
|     // Constructor requires "grid" passed. | ||||
|     // what about a default grid? | ||||
|     ////////////////////////////////////////////////////////////////// | ||||
|     Lattice(GridBase *grid) : _odata(grid->oSites()) { | ||||
|         _grid = grid; | ||||
|   ////////////////////////////////////////////////////////////////// | ||||
|   // Constructor requires "grid" passed. | ||||
|   // what about a default grid? | ||||
|   ////////////////////////////////////////////////////////////////// | ||||
|   Lattice(GridBase *grid) : _odata(grid->oSites()) { | ||||
|     _grid = grid; | ||||
|     //        _odata.reserve(_grid->oSites()); | ||||
|     //        _odata.resize(_grid->oSites()); | ||||
|     //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl; | ||||
|         assert((((uint64_t)&_odata[0])&0xF) ==0); | ||||
|         checkerboard=0; | ||||
|     assert((((uint64_t)&_odata[0])&0xF) ==0); | ||||
|     checkerboard=0; | ||||
|   } | ||||
|    | ||||
|   Lattice(const Lattice& r){ // copy constructor | ||||
|     _grid = r._grid; | ||||
|     checkerboard = r.checkerboard; | ||||
|     _odata.resize(_grid->oSites());// essential | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       _odata[ss]=r._odata[ss]; | ||||
|     }  	 | ||||
|   } | ||||
|    | ||||
|     Lattice(const Lattice& r){ // copy constructor | ||||
|     	_grid = r._grid; | ||||
|     	checkerboard = r.checkerboard; | ||||
|     	_odata.resize(_grid->oSites());// essential | ||||
|   		PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|             _odata[ss]=r._odata[ss]; | ||||
|         }  	 | ||||
|    | ||||
|    | ||||
|   virtual ~Lattice(void) = default; | ||||
|      | ||||
|   void reset(GridBase* grid) { | ||||
|     if (_grid != grid) { | ||||
|       _grid = grid; | ||||
|       _odata.resize(grid->oSites()); | ||||
|       checkerboard = 0; | ||||
|     } | ||||
|   } | ||||
|    | ||||
|  | ||||
|  | ||||
|     virtual ~Lattice(void) = default; | ||||
|      | ||||
|     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|             this->_odata[ss]=r; | ||||
|         } | ||||
|         return *this; | ||||
|   template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       this->_odata[ss]=r; | ||||
|     } | ||||
|     template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ | ||||
|       this->checkerboard = r.checkerboard; | ||||
|       conformable(*this,r); | ||||
|     return *this; | ||||
|   } | ||||
|    | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|             this->_odata[ss]=r._odata[ss]; | ||||
|         } | ||||
|         return *this; | ||||
|     } | ||||
|   template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ | ||||
|     this->checkerboard = r.checkerboard; | ||||
|     conformable(*this,r); | ||||
|      | ||||
|     // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation | ||||
|     template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { | ||||
|         *this = (*this)*r; | ||||
|         return *this; | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       this->_odata[ss]=r._odata[ss]; | ||||
|     } | ||||
|     return *this; | ||||
|   } | ||||
|    | ||||
|     template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { | ||||
|         *this = (*this)-r; | ||||
|         return *this; | ||||
|     } | ||||
|     template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { | ||||
|         *this = (*this)+r; | ||||
|         return *this; | ||||
|     } | ||||
|  }; // class Lattice | ||||
|   // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation | ||||
|   template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { | ||||
|     *this = (*this)*r; | ||||
|     return *this; | ||||
|   } | ||||
|    | ||||
|   template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { | ||||
|     *this = (*this)-r; | ||||
|     return *this; | ||||
|   } | ||||
|   template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { | ||||
|     *this = (*this)+r; | ||||
|     return *this; | ||||
|   } | ||||
| }; // class Lattice | ||||
|    | ||||
|   template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ | ||||
|     std::vector<int> gcoor; | ||||
|   | ||||
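The reflowed block above keeps the two load-bearing details of the copy constructor: _odata.resize(...) before the copy (marked "essential"), and the site loop running under parallel_for; operator=(const sobj&) likewise broadcasts one scalar to every site. A stripped-down sketch of both, assuming a stand-in class with no grid or checkerboard and raw OpenMP in place of parallel_for:

    #include <vector>
    #include <cstddef>

    template<class vobj> class MiniLattice { // illustrative stand-in, not Grid's Lattice
      std::vector<vobj> _odata;
    public:
      explicit MiniLattice(std::size_t sites) : _odata(sites) {}

      MiniLattice(const MiniLattice &r) : _odata(r._odata.size()) { // resize is essential
        #pragma omp parallel for
        for (long ss = 0; ss < (long)_odata.size(); ss++)
          _odata[ss] = r._odata[ss];
      }

      template<class sobj> MiniLattice &operator=(const sobj &r) { // scalar fills every site
        #pragma omp parallel for
        for (long ss = 0; ss < (long)_odata.size(); ss++)
          _odata[ss] = r;
        return *this;
      }
    };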
| @@ -45,90 +45,87 @@ namespace Grid { | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vfunctor,class lobj,class robj>   | ||||
|     inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) | ||||
|     { | ||||
|       Lattice<vInteger> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|   { | ||||
|     Lattice<vInteger> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // compare lattice to scalar | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|     template<class vfunctor,class lobj,class robj>  | ||||
|   template<class vfunctor,class lobj,class robj>  | ||||
|     inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) | ||||
|     { | ||||
|       Lattice<vInteger> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	  ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||
|         } | ||||
|         return ret; | ||||
|   { | ||||
|     Lattice<vInteger> ret(lhs._grid); | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // compare scalar to lattice | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|     template<class vfunctor,class lobj,class robj>  | ||||
|   template<class vfunctor,class lobj,class robj>  | ||||
|     inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) | ||||
|     { | ||||
|       Lattice<vInteger> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	  ret._odata[ss]=op(lhs,rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|   { | ||||
|     Lattice<vInteger> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs,rhs._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|    | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // Map to functors | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|     // Less than | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|   // Less than | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|     return SLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|    | ||||
|    // Less than equal | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|   // Less than equal | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|     return SLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|    | ||||
|    // Greater than  | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   // Greater than  | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|   } | ||||
|    | ||||
|    | ||||
|    // Greater than equal | ||||
|   // Greater than equal | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
| @@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP | ||||
|      return LSComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|     | ||||
|    // equal | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|     | ||||
|     | ||||
|    // not equal | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
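The whole file is one pattern: a generic driver per operand shape (LLComparison, LSComparison, SLComparison) plus thin operator overloads that bind a comparison functor. A compact sketch of the mechanism, assuming std::vector<int> as a stand-in for Lattice<vInteger> (lt_mask is a hypothetical name standing in for operator<):

    #include <vector>
    #include <cstddef>

    // Functor: site-wise "less than" returning an integer truth value
    template<class lobj, class robj> struct vlt {
      int operator()(const lobj &l, const robj &r) const { return l < r; }
    };

    // Driver shared by every lattice-lattice comparison
    template<class vfunctor, class lobj, class robj>
    std::vector<int> LLComparison(vfunctor op,
                                  const std::vector<lobj> &lhs,
                                  const std::vector<robj> &rhs)
    {
      std::vector<int> ret(rhs.size());
      for (std::size_t ss = 0; ss < rhs.size(); ss++)
        ret[ss] = op(lhs[ss], rhs[ss]); // integer mask, one entry per site
      return ret;
    }

    // Thin wrapper binding the functor to the driver
    template<class lobj, class robj>
    std::vector<int> lt_mask(const std::vector<lobj> &lhs, const std::vector<robj> &rhs)
    {
      return LLComparison(vlt<lobj, robj>(), lhs, rhs);
    }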
| @@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Non site, reduced locally reduced routines | ||||
|     ///////////////////////////////////////////////////// | ||||
|   ///////////////////////////////////////////////////// | ||||
|   // Non-site, locally reduced routines | ||||
|   ///////////////////////////////////////////////////// | ||||
|    | ||||
|     // localNorm2, | ||||
|     template<class vobj> | ||||
|   // localNorm2, | ||||
|   template<class vobj> | ||||
|     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|    | ||||
|     // localInnerProduct | ||||
|     template<class vobj> | ||||
|   // localInnerProduct | ||||
|   template<class vobj> | ||||
|     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|    | ||||
|     // outerProduct Scalar x Scalar -> Scalar | ||||
|     //              Vector x Vector -> Matrix | ||||
|     template<class ll,class rr> | ||||
|   // outerProduct Scalar x Scalar -> Scalar | ||||
|   //              Vector x Vector -> Matrix | ||||
|   template<class ll,class rr> | ||||
|     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> | ||||
|     { | ||||
|         Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|             ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|      } | ||||
|  | ||||
|   { | ||||
|     Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
| } | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -37,8 +37,7 @@ namespace Grid { | ||||
|   inline Lattice<vobj> operator -(const Lattice<vobj> &r) | ||||
|   { | ||||
|     Lattice<vobj> ret(r._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<r._grid->oSites();ss++){ | ||||
|     parallel_for(int ss=0;ss<r._grid->oSites();ss++){ | ||||
|       vstream(ret._odata[ss], -r._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
| @@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP | ||||
|   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];  | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| 	   //      ret._odata[ss]=lhs*rhs._odata[ss]; | ||||
| @@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP | ||||
|     inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])> | ||||
|     { | ||||
|       Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];   | ||||
| 	vstream(ret._odata[ss],tmp); | ||||
| 	//	ret._odata[ss]=lhs+rhs._odata[ss]; | ||||
| @@ -98,11 +95,9 @@ PARALLEL_FOR_LOOP | ||||
|     inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];   | ||||
|       vstream(ret._odata[ss],tmp); | ||||
|       //      ret._odata[ss]=lhs-rhs._odata[ss]; | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
| @@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP | ||||
|       inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)> | ||||
|     { | ||||
|       Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs; | ||||
| 	vstream(ret._odata[ss],tmp); | ||||
| 	//            ret._odata[ss]=lhs._odata[ss]*rhs; | ||||
| @@ -122,8 +116,7 @@ PARALLEL_FOR_LOOP | ||||
|       inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)> | ||||
|     { | ||||
|         Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;  | ||||
| 	  vstream(ret._odata[ss],tmp); | ||||
| 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs; | ||||
| @@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP | ||||
|       inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)> | ||||
|     { | ||||
|       Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs; | ||||
| 	  vstream(ret._odata[ss],tmp); | ||||
| 	  //	ret._odata[ss]=lhs._odata[ss]-rhs; | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
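Note the fix folded into the scalar-plus-lattice overload above: tmp is now lhs+rhs._odata[ss] where the line previously computed lhs-rhs, matching the commented-out intent on the next line. A regression-style check of the intended semantics, with std::vector standing in for Lattice (all names invented for illustration):

    #include <vector>
    #include <cassert>
    #include <cstddef>

    // scalar + field must add the scalar at every site
    template<class T>
    std::vector<T> operator+(T lhs, const std::vector<T> &rhs)
    {
      std::vector<T> ret(rhs.size());
      for (std::size_t ss = 0; ss < rhs.size(); ss++)
        ret[ss] = lhs + rhs[ss]; // the buggy version computed lhs - rhs here
      return ret;
    }

    int main(void)
    {
      std::vector<double> field(4, 1.0);
      std::vector<double> shifted = 2.0 + field;
      for (double v : shifted) assert(v == 3.0); // fails with the old lhs-rhs line
      return 0;
    }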
| @@ -44,22 +44,20 @@ namespace Grid { | ||||
|     { | ||||
|       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid); | ||||
|       ret.checkerboard=lhs.checkerboard; | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i); | ||||
|         } | ||||
|         return ret; | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i); | ||||
|       } | ||||
|       return ret; | ||||
|     }; | ||||
|     template<int Index,class vobj> | ||||
|        auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> | ||||
|       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> | ||||
|     { | ||||
|       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid); | ||||
|       ret.checkerboard=lhs.checkerboard; | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j); | ||||
|         } | ||||
|         return ret; | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j); | ||||
|       } | ||||
|       return ret; | ||||
|     }; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP | ||||
|     template<int Index,class vobj>  | ||||
|     void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i) | ||||
|     { | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i); | ||||
| 	}       | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i); | ||||
|       }       | ||||
|     } | ||||
|     template<int Index,class vobj> | ||||
|       void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j) | ||||
|     { | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j); | ||||
| 	}       | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j); | ||||
|       }       | ||||
|     } | ||||
|  | ||||
|     ////////////////////////////////////////////////////// | ||||
| @@ -131,9 +127,6 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
|       assert( l.checkerboard == l._grid->CheckerBoard(site)); | ||||
|  | ||||
|       // FIXME | ||||
|       //      assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||
|  | ||||
|       int rank,odx,idx; | ||||
|       grid->GlobalCoorToRankIndex(rank,odx,idx,site); | ||||
|  | ||||
|   | ||||
| @@ -40,8 +40,7 @@ namespace Grid { | ||||
|  | ||||
|     template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ | ||||
|         Lattice<vobj> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|             ret._odata[ss] = adj(lhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
| @@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP | ||||
|  | ||||
|     template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ | ||||
|         Lattice<vobj> ret(lhs._grid); | ||||
| PARALLEL_FOR_LOOP | ||||
|         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|             ret._odata[ss] = conjugate(lhs._odata[ss]); | ||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	  ret._odata[ss] = conjugate(lhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|     }; | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
| @@ -1,159 +1,154 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|  /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/lattice/Lattice_reduction.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_REDUCTION_H | ||||
| #define GRID_LATTICE_REDUCTION_H | ||||
|  | ||||
| #include <Grid/Grid_Eigen_Dense.h> | ||||
|  | ||||
| namespace Grid { | ||||
| #ifdef GRID_WARN_SUBOPTIMAL | ||||
| #warning "Optimisation alert all these reduction loops are NOT threaded " | ||||
| #endif      | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Deterministic Reduction operations | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | ||||
|     ComplexD nrm = innerProduct(arg,arg); | ||||
|     return std::real(nrm);  | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Deterministic Reduction operations | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | ||||
|   ComplexD nrm = innerProduct(arg,arg); | ||||
|   return std::real(nrm);  | ||||
| } | ||||
|  | ||||
| // Double inner product | ||||
| template<class vobj> | ||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)  | ||||
| { | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_typeD vector_type; | ||||
|   scalar_type  nrm; | ||||
|    | ||||
|   GridBase *grid = left._grid; | ||||
|    | ||||
|   std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); | ||||
|    | ||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|     int nwork, mywork, myoff; | ||||
|     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||
|      | ||||
|     decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]); | ||||
|     } | ||||
|     sumarray[thr]=TensorRemove(vnrm) ; | ||||
|   } | ||||
|    | ||||
|     template<class vobj> | ||||
|     inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)  | ||||
|     { | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|       scalar_type  nrm; | ||||
|   vector_type vvnrm; vvnrm=zero;  // sum across threads | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     vvnrm = vvnrm+sumarray[i]; | ||||
|   }  | ||||
|   nrm = Reduce(vvnrm);// sum across simd | ||||
|   right._grid->GlobalSum(nrm); | ||||
|   return nrm; | ||||
| } | ||||
|   | ||||
|       GridBase *grid = left._grid; | ||||
| template<class Op,class T1> | ||||
| inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) | ||||
|   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
|       std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); | ||||
|       for(int i=0;i<grid->SumArraySize();i++){ | ||||
| 	sumarray[i]=zero; | ||||
|       } | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
| 	int nwork, mywork, myoff; | ||||
| 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||
| 	 | ||||
| 	decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation | ||||
|         for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
| 	  vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]); | ||||
| 	} | ||||
| 	sumarray[thr]=TensorRemove(vnrm) ; | ||||
|       } | ||||
|      | ||||
|       vector_type vvnrm; vvnrm=zero;  // sum across threads | ||||
|       for(int i=0;i<grid->SumArraySize();i++){ | ||||
| 	vvnrm = vvnrm+sumarray[i]; | ||||
|       }  | ||||
|       nrm = Reduce(vvnrm);// sum across simd | ||||
|       right._grid->GlobalSum(nrm); | ||||
|       return nrm; | ||||
|     } | ||||
|  | ||||
|     template<class Op,class T1> | ||||
|       inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) | ||||
|       ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object | ||||
|     { | ||||
|       return sum(closure(expr)); | ||||
|     } | ||||
|  | ||||
|     template<class Op,class T1,class T2> | ||||
|       inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) | ||||
| template<class Op,class T1,class T2> | ||||
| inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) | ||||
|       ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object | ||||
|     { | ||||
|       return sum(closure(expr)); | ||||
|     } | ||||
|  | ||||
|  | ||||
|     template<class Op,class T1,class T2,class T3> | ||||
|       inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) | ||||
|       ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), | ||||
| 				 eval(0,std::get<1>(expr.second)), | ||||
| 				 eval(0,std::get<2>(expr.second)) | ||||
| 				 ))::scalar_object | ||||
|     { | ||||
|       return sum(closure(expr)); | ||||
|     } | ||||
|  | ||||
|     template<class vobj> | ||||
|     inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){ | ||||
|  | ||||
|       GridBase *grid=arg._grid; | ||||
|       int Nsimd = grid->Nsimd(); | ||||
|  | ||||
|       std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize()); | ||||
|       for(int i=0;i<grid->SumArraySize();i++){ | ||||
| 	sumarray[i]=zero; | ||||
|       } | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|       for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
| 	int nwork, mywork, myoff; | ||||
| 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff); | ||||
|  | ||||
| 	vobj vvsum=zero; | ||||
|         for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
| 	  vvsum = vvsum + arg._odata[ss]; | ||||
| 	} | ||||
| 	sumarray[thr]=vvsum; | ||||
|       } | ||||
|  | ||||
|       vobj vsum=zero;  // sum across threads | ||||
|       for(int i=0;i<grid->SumArraySize();i++){ | ||||
| 	vsum = vsum+sumarray[i]; | ||||
|       }  | ||||
|  | ||||
|       typedef typename vobj::scalar_object sobj; | ||||
|       sobj ssum=zero; | ||||
|  | ||||
|       std::vector<sobj>               buf(Nsimd); | ||||
|       extract(vsum,buf); | ||||
|  | ||||
|       for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i]; | ||||
|       arg._grid->GlobalSum(ssum); | ||||
|  | ||||
|       return ssum; | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class Op,class T1,class T2,class T3> | ||||
| inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) | ||||
|   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), | ||||
| 				      eval(0,std::get<1>(expr.second)), | ||||
| 				      eval(0,std::get<2>(expr.second)) | ||||
| 				      ))::scalar_object | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   GridBase *grid=arg._grid; | ||||
|   int Nsimd = grid->Nsimd(); | ||||
|    | ||||
|   std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize()); | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     sumarray[i]=zero; | ||||
|   } | ||||
|    | ||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|     int nwork, mywork, myoff; | ||||
|     GridThread::GetWork(grid->oSites(),thr,mywork,myoff); | ||||
|      | ||||
|     vobj vvsum=zero; | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vvsum = vvsum + arg._odata[ss]; | ||||
|     } | ||||
|     sumarray[thr]=vvsum; | ||||
|   } | ||||
|    | ||||
|   vobj vsum=zero;  // sum across threads | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     vsum = vsum+sumarray[i]; | ||||
|   }  | ||||
|    | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   sobj ssum=zero; | ||||
|    | ||||
|   std::vector<sobj>               buf(Nsimd); | ||||
|   extract(vsum,buf); | ||||
|    | ||||
|   for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i]; | ||||
|   arg._grid->GlobalSum(ssum); | ||||
|    | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
|  | ||||
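Both innerProduct and sum above follow one recipe: split the site range over SumArraySize() threads with GetWork, accumulate a private partial per thread, combine the partials serially, then GlobalSum across ranks. A self-contained sketch of the recipe, assuming plain OpenMP and double data (no SIMD lanes and no MPI step):

    #include <omp.h>
    #include <vector>

    double thread_partitioned_sum(const std::vector<double> &data)
    {
      const int  nthread = omp_get_max_threads();
      const long sites   = (long)data.size();
      std::vector<double> sumarray(nthread, 0.0);

      #pragma omp parallel num_threads(nthread)
      {
        const int  thr    = omp_get_thread_num();
        // GetWork analogue: contiguous chunk [myoff, myend) for this thread
        const long mywork = (sites + nthread - 1) / nthread;
        const long myoff  = thr * mywork;
        const long myend  = (myoff + mywork < sites) ? myoff + mywork : sites;

        double vvsum = 0.0; // private to thread; sub summation
        for (long ss = myoff; ss < myend; ss++) vvsum += data[ss];
        sumarray[thr] = vvsum;
      }

      double vsum = 0.0; // serial combine across threads
      for (int i = 0; i < nthread; i++) vsum += sumarray[i];
      return vsum;       // a GlobalSum over ranks would follow in Grid
    }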
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) | ||||
| { | ||||
|   /////////////////////////////////////////////////////// | ||||
|   // FIXME precision promoted summation | ||||
|   // may be important for correlation functions | ||||
|   // But easily avoided by using double precision fields | ||||
|   /////////////////////////////////////////////////////// | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   GridBase  *grid = Data._grid; | ||||
|   assert(grid!=NULL); | ||||
|  | ||||
|   // FIXME | ||||
|   // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl; | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
| @@ -165,22 +160,30 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|   std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first | ||||
|   std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars | ||||
|   std::vector<sobj> extracted(Nsimd);     // splitting the SIMD | ||||
|   std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars | ||||
|   std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node  | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|   } | ||||
|  | ||||
|   std::vector<int>  coor(Nd);   | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
|   // Parallel over orthog direction | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|  | ||||
|   for(int ss=0;ss<grid->oSites();ss++){ | ||||
|     Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); | ||||
|     int r = coor[orthogdim]; | ||||
|     lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
| @@ -216,10 +219,305 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|  | ||||
|     result[t]=gsum; | ||||
|   } | ||||
| } | ||||
|  | ||||
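The rewrite above turns sliceSum's site loop inside out: instead of a serial pass decoding every site with CoorFromIndex, it runs parallel_for over the reduced orthogonal extent rd, so each iteration owns one plane (base offset so = r*_ostride[orthogdim], walked with the _slice_nblock/_slice_block/_slice_stride strides) and no two threads share an accumulator. A toy version of the decomposition, assuming a C-ordered L*L*L array of doubles with the orthogonal direction as the slowest index:

    #include <vector>

    std::vector<double> slice_sum(const std::vector<double> &data, int L)
    {
      const int ostride = L * L; // sites per orthogonal plane
      std::vector<double> lvSum(L, 0.0);
      #pragma omp parallel for
      for (int r = 0; r < L; r++) {       // parallel over the orthogonal direction
        const int so = r * ostride;       // base offset for start of plane
        for (int b = 0; b < ostride; b++) // e1/e2/stride collapse to one block here
          lvSum[r] += data[so + b];       // race-free: each thread owns plane r
      }
      return lvSum; // per-slice GlobalSum across ranks would follow
    }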
| template<class vobj> | ||||
| static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | ||||
| { | ||||
|   typedef typename vobj::vector_type   vector_type; | ||||
|   typedef typename vobj::scalar_type   scalar_type; | ||||
|   GridBase  *grid = lhs._grid; | ||||
|   assert(grid!=NULL); | ||||
|   conformable(grid,rhs._grid); | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
|   assert(orthogdim >= 0); | ||||
|   assert(orthogdim < Nd); | ||||
|  | ||||
|   int fd=grid->_fdimensions[orthogdim]; | ||||
|   int ld=grid->_ldimensions[orthogdim]; | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first | ||||
|   std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||
|   std::vector<iScalar<scalar_type> > extracted(Nsimd);                  // splitting the SIMD | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|   } | ||||
|  | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); | ||||
| 	lvSum[r]=lvSum[r]+vv; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
|   std::vector<int> icoor(Nd); | ||||
|   for(int rt=0;rt<rd;rt++){ | ||||
|  | ||||
|     iScalar<vector_type> temp;  | ||||
|     temp._internal = lvSum[rt]; | ||||
|     extract(temp,extracted); | ||||
|  | ||||
|     for(int idx=0;idx<Nsimd;idx++){ | ||||
|  | ||||
|       grid->iCoorFromIindex(icoor,idx); | ||||
|  | ||||
|       int ldx =rt+icoor[orthogdim]*rd; | ||||
|  | ||||
|       lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; | ||||
|  | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // sum over nodes. | ||||
|   scalar_type gsum; | ||||
|   for(int t=0;t<fd;t++){ | ||||
|     int pt = t/ld; // processor plane | ||||
|     int lt = t%ld; | ||||
|     if ( pt == grid->_processor_coor[orthogdim] ) { | ||||
|       gsum=lsSum[lt]; | ||||
|     } else { | ||||
|       gsum=scalar_type(0.0); | ||||
|     } | ||||
|  | ||||
|     grid->GlobalSum(gsum); | ||||
|  | ||||
|     result[t]=gsum; | ||||
|   } | ||||
| } | ||||
| template<class vobj> | ||||
| static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)  | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   int Nblock = rhs._grid->GlobalDimensions()[Orthog]; | ||||
|   std::vector<ComplexD> ip(Nblock); | ||||
|   sn.resize(Nblock); | ||||
|    | ||||
|   sliceInnerProductVector(ip,rhs,rhs,Orthog); | ||||
|   for(int ss=0;ss<Nblock;ss++){ | ||||
|     sn[ss] = real(ip[ss]); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||
| 			    int orthogdim,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::tensor_reduced tensor_reduced; | ||||
|    | ||||
|   scalar_type zscale(scale); | ||||
|  | ||||
|   GridBase *grid  = X._grid; | ||||
|  | ||||
|   int Nsimd  =grid->Nsimd(); | ||||
|   int Nblock =grid->GlobalDimensions()[orthogdim]; | ||||
|  | ||||
|   int fd     =grid->_fdimensions[orthogdim]; | ||||
|   int ld     =grid->_ldimensions[orthogdim]; | ||||
|   int rd     =grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|   int e1     =grid->_slice_nblock[orthogdim]; | ||||
|   int e2     =grid->_slice_block [orthogdim]; | ||||
|   int stride =grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   std::vector<int> icoor; | ||||
|  | ||||
|   for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     vector_type    av; | ||||
|  | ||||
|     for(int l=0;l<Nsimd;l++){ | ||||
|       grid->iCoorFromIindex(icoor,l); | ||||
|       int ldx =r+icoor[orthogdim]*rd; | ||||
|       scalar_type *as =(scalar_type *)&av; | ||||
|       as[l] = scalar_type(a[ldx])*zscale; | ||||
|     } | ||||
|  | ||||
|     tensor_reduced at; at=av; | ||||
|  | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| /* | ||||
| template<class vobj> | ||||
| static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||
| 			     int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   // FIXME: Implementation is slow | ||||
|   // Best base the linear combination by constructing a  | ||||
|   // set of vectors of size grid->_rdimensions[Orthog]. | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   Lattice<vobj> Xslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|   // If we based this on Cshift it would work for spread out | ||||
|   // but it would be even slower | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Rslice,Y,i,Orthog); | ||||
|     ExtractSlice(Xslice,X,i,Orthog); | ||||
|     Rslice = Rslice + Xslice*(scale*a[i]); | ||||
|     InsertSlice(Rslice,R,i,Orthog); | ||||
|   } | ||||
| }; | ||||
| template<class vobj> | ||||
| static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
|   { | ||||
|     // FIXME: Implementation is slow | ||||
|     // Look at localInnerProduct implementation, | ||||
|     // and do inside a site loop with block strided iterators | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|     typedef typename vobj::tensor_reduced scalar; | ||||
|     typedef typename scalar::scalar_object  scomplex; | ||||
|    | ||||
|     int Nblock = lhs._grid->GlobalDimensions()[Orthog]; | ||||
|     vec.resize(Nblock); | ||||
|     std::vector<scomplex> sip(Nblock); | ||||
|     Lattice<scalar> IP(lhs._grid);  | ||||
|     IP=localInnerProduct(lhs,rhs); | ||||
|     sliceSum(IP,sip,Orthog); | ||||
|    | ||||
|     for(int ss=0;ss<Nblock;ss++){ | ||||
|       vec[ss] = TensorRemove(sip[ss]); | ||||
|     } | ||||
|   } | ||||
| */ | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // FIXME: Implementation is slow | ||||
| // If we based this on Cshift it would work for spread out | ||||
| // but it would be even slower | ||||
| // | ||||
| // Repeated extract slice is inefficient | ||||
| // | ||||
| // Best base the linear combination by constructing a  | ||||
| // set of vectors of size grid->_rdimensions[Orthog]. | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||
| { | ||||
|   int NN    = BlockSolverGrid->_ndimension; | ||||
|   int nsimd = BlockSolverGrid->Nsimd(); | ||||
|    | ||||
|   std::vector<int> latt_phys(0); | ||||
|   std::vector<int> simd_phys(0); | ||||
|   std::vector<int>  mpi_phys(0); | ||||
|    | ||||
|   for(int d=0;d<NN;d++){ | ||||
|     if( d!=Orthog ) {  | ||||
|       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); | ||||
|       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); | ||||
|       mpi_phys.push_back(BlockSolverGrid->_processors[d]); | ||||
|     } | ||||
|   } | ||||
|   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   Lattice<vobj> Xslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|    | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Rslice,Y,i,Orthog); | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       ExtractSlice(Xslice,X,j,Orthog); | ||||
|       Rslice = Rslice + Xslice*(scale*aa(j,i)); | ||||
|     } | ||||
|     InsertSlice(Rslice,R,i,Orthog); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
| { | ||||
|   // FIXME: Implementation is slow | ||||
|   // Not sure of best solution.. think about it | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   GridBase *FullGrid  = lhs._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   Lattice<vobj> Lslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|    | ||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|    | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Lslice,lhs,i,Orthog); | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       ExtractSlice(Rslice,rhs,j,Orthog); | ||||
|       mat(i,j) = innerProduct(Lslice,Rslice); | ||||
|     } | ||||
|   } | ||||
| #undef FORCE_DIAG // switch to #define to zero the off-diagonal entries (debugging aid) | ||||
| #ifdef FORCE_DIAG | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       if ( i != j ) mat(i,j)=0.0; | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|   return; | ||||
| } | ||||
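|  | ||||
| // A minimal sketch combining the two routines (exampleSliceGram is an | ||||
| // illustrative name, not library API): form the Nblock x Nblock matrix of | ||||
| // slice inner products, then apply it back as a linear combination. | ||||
| template<class vobj> | ||||
| inline void exampleSliceGram(Lattice<vobj> &R,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog) | ||||
| { | ||||
|   Eigen::MatrixXcd G; | ||||
|   sliceInnerProductMatrix(G,X,Y,Orthog); // G(i,j) = innerProduct of slices i,j | ||||
|   sliceMaddMatrix(R,G,X,Y,Orthog);       // R_i = Y_i + sum_j X_j * G(j,i) | ||||
| } | ||||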
|  | ||||
| } /*END NAMESPACE GRID*/ | ||||
| #endif | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -6,8 +6,8 @@ | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
| @@ -30,11 +30,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #define GRID_LATTICE_RNG_H | ||||
|  | ||||
| #include <random> | ||||
|  | ||||
| #ifdef RNG_SITMO | ||||
| #include <Grid/sitmo_rng/sitmo_prng_engine.hpp> | ||||
| #endif  | ||||
|  | ||||
| #if defined(RNG_SITMO) | ||||
| #define RNG_FAST_DISCARD | ||||
| #else  | ||||
| #undef  RNG_FAST_DISCARD | ||||
| #endif | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////// | ||||
|   // Allow the RNG state to be less dense than the fine grid | ||||
|   ////////////////////////////////////////////////////////////// | ||||
| @@ -64,115 +72,188 @@ namespace Grid { | ||||
|  | ||||
|       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];  | ||||
|     } | ||||
|  | ||||
|     return multiplicity; | ||||
|   } | ||||
|  | ||||
|    | ||||
|   // this function is necessary for the Ls-vectorised (five-dimensional) field | ||||
|   inline int RNGfillable_general(GridBase *coarse,GridBase *fine) | ||||
|   { | ||||
|     int rngdims = coarse->_ndimension; | ||||
|      | ||||
|     // trivially extended in higher dims, with locality guaranteeing RNG state is local to node | ||||
|     int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0); | ||||
|     // assumes that the higher dimensions are not using more processors | ||||
|     // all further divisions are local | ||||
|     for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1); | ||||
|     for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]); | ||||
|      | ||||
|  | ||||
|     // then divide the number of local sites; | ||||
|     // check that the total numbers of SIMD lanes agree, which means the iSites are the same | ||||
|     assert(fine->Nsimd() == coarse->Nsimd()); | ||||
|  | ||||
|     // check that the two grids divide cleanly | ||||
|     assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); | ||||
|  | ||||
|     return fine->lSites() / coarse->lSites(); | ||||
|   } | ||||
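|  | ||||
|   // Sketch of the intended use (assuming Grid's usual five-dimensional setup, | ||||
|   // with the Ls direction leading and undivided across processors): | ||||
|   //   GridBase *UGrid;                              // 4d RNG grid | ||||
|   //   GridBase *FGrid;                              // 5d field grid, Ls first | ||||
|   //   int mult = RNGfillable_general(UGrid,FGrid);  // == Ls | ||||
|   // Each coarse RNG site then serves mult fine sites in fill(). | ||||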
|  | ||||
|   /* | ||||
|   // Wrap seed_seq to give common interface with random_device | ||||
|   class fixedSeed { | ||||
|   public: | ||||
|  | ||||
|     typedef std::seed_seq::result_type result_type; | ||||
|  | ||||
|     std::seed_seq src; | ||||
|      | ||||
|     fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {}; | ||||
|  | ||||
|     result_type operator () (void){ | ||||
|  | ||||
|       std::vector<result_type> list(1); | ||||
|  | ||||
|       src.generate(list.begin(),list.end()); | ||||
|  | ||||
|       return list[0]; | ||||
|  | ||||
|     } | ||||
|  | ||||
|   }; | ||||
|  | ||||
|   */ | ||||
|    | ||||
|   // real scalars are one component | ||||
|   template<class scalar,class distribution,class generator>  | ||||
|   void fillScalar(scalar &s,distribution &dist,generator & gen) | ||||
|   { | ||||
|     s=dist(gen); | ||||
|   } | ||||
|   template<class distribution,class generator>  | ||||
|   void fillScalar(ComplexF &s,distribution &dist, generator &gen) | ||||
|   { | ||||
|     s=ComplexF(dist(gen),dist(gen)); | ||||
|   } | ||||
|   template<class distribution,class generator>  | ||||
|   void fillScalar(ComplexD &s,distribution &dist,generator &gen) | ||||
|   { | ||||
|     s=ComplexD(dist(gen),dist(gen)); | ||||
|   } | ||||
|    | ||||
|   class GridRNGbase { | ||||
|  | ||||
|   public: | ||||
|  | ||||
|     int _seeded; | ||||
|     // One generator per site. | ||||
|     // Uniform and Gaussian distributions from these generators. | ||||
| #ifdef RNG_RANLUX | ||||
|     typedef std::ranlux48 RngEngine; | ||||
|     typedef uint64_t      RngStateType; | ||||
|     static const int RngStateCount = 15; | ||||
| #endif | ||||
| #ifdef RNG_MT19937 | ||||
|     typedef std::mt19937 RngEngine; | ||||
|     typedef uint32_t     RngStateType; | ||||
|     static const int     RngStateCount = std::mt19937::state_size; | ||||
| #endif | ||||
| #ifdef RNG_SITMO | ||||
|     typedef sitmo::prng_engine RngEngine; | ||||
|     typedef uint64_t           RngStateType; | ||||
|     static const int           RngStateCount = 13; | ||||
| #endif | ||||
|     std::vector<RngEngine>                             _generators; | ||||
|     std::vector<std::uniform_real_distribution<RealD> > _uniform; | ||||
|     std::vector<std::normal_distribution<RealD> >       _gaussian; | ||||
|     std::vector<std::discrete_distribution<int32_t> >   _bernoulli; | ||||
|     std::vector<std::uniform_int_distribution<uint32_t> > _uid; | ||||
|  | ||||
|     /////////////////////// | ||||
|     // support for parallel init | ||||
|     /////////////////////// | ||||
| #ifdef RNG_FAST_DISCARD | ||||
|     static void Skip(RngEngine &eng) | ||||
|     { | ||||
|       ///////////////////////////////////////////////////////////////////////////////////// | ||||
|       // Skip by 2^40 elements between successive lattice sites. | ||||
|       // That is of order 10^12 draws per site stream. | ||||
|       // Consider quenched updating: it is unlikely ever to exceed a rate of 1000 sweeps | ||||
|       // per second on any machine, which gives of order 10^9 seconds, i.e. roughly 30 | ||||
|       // years, before a stream could be exhausted. | ||||
|       // HMC is unlikely to run faster than one solve per second, with tens of seconds | ||||
|       // per trajectory, so this is safe in all reasonable cases and the margin of | ||||
|       // safety is orders of magnitude. | ||||
|       // We could hack Sitmo to skip in the higher-order words of state if necessary. | ||||
|       ///////////////////////////////////////////////////////////////////////////////////// | ||||
|       uint64_t skip = 0x1; skip = skip<<40; | ||||
|       eng.discard(skip); | ||||
|     }  | ||||
| #endif | ||||
|     static RngEngine Reseed(RngEngine &eng) | ||||
|     { | ||||
|       std::vector<uint32_t> newseed; | ||||
|       std::uniform_int_distribution<uint32_t> uid; | ||||
|       return Reseed(eng,newseed,uid); | ||||
|     } | ||||
|     static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, | ||||
| 			    std::uniform_int_distribution<uint32_t> &uid) | ||||
|     { | ||||
|       const int reseeds=4; | ||||
|        | ||||
|       newseed.resize(reseeds); | ||||
|       for(int i=0;i<reseeds;i++){ | ||||
| 	newseed[i] = uid(eng); | ||||
|       } | ||||
|       std::seed_seq sseq(newseed.begin(),newseed.end()); | ||||
|       return RngEngine(sseq); | ||||
|     }     | ||||
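|  | ||||
|     // Sketch of the parent/child chain (illustrative): each Reseed draws four | ||||
|     // 32-bit words from the parent, feeds them through a seed_seq, and returns | ||||
|     // a fresh engine; the parent advances, so successive children differ. | ||||
|     //   RngEngine node   = Reseed(master);   // e.g. one per rank | ||||
|     //   RngEngine thread = Reseed(node);     // e.g. one per thread | ||||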
|  | ||||
|     void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { | ||||
|       saved.resize(RngStateCount); | ||||
|       std::stringstream ss; | ||||
|       ss<<eng; | ||||
|       ss.seekg(0,ss.beg); | ||||
|       for(int i=0;i<RngStateCount;i++){ | ||||
|         ss>>saved[i]; | ||||
|       } | ||||
|     } | ||||
|     void GetState(std::vector<RngStateType> & saved,int gen) { | ||||
|       GetState(saved,_generators[gen]); | ||||
|     } | ||||
|     void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ | ||||
|       assert(saved.size()==RngStateCount); | ||||
|       std::stringstream ss; | ||||
|       for(int i=0;i<RngStateCount;i++){ | ||||
|         ss<< saved[i]<<" "; | ||||
|       } | ||||
|       ss.seekg(0,ss.beg); | ||||
|       ss>>eng; | ||||
|     } | ||||
|     void SetState(std::vector<RngStateType> & saved,int gen){ | ||||
|       SetState(saved,_generators[gen]); | ||||
|     } | ||||
|     void SetEngine(RngEngine &Eng, int gen){ | ||||
|       _generators[gen]=Eng; | ||||
|     } | ||||
|     void GetEngine(RngEngine &Eng, int gen){ | ||||
|       Eng=_generators[gen]; | ||||
|     } | ||||
|     template<class source> void Seed(source &src, int gen) | ||||
|     { | ||||
|       _generators[gen] = RngEngine(src); | ||||
|     }     | ||||
|   }; | ||||
|  | ||||
|   class GridSerialRNG : public GridRNGbase { | ||||
|   public: | ||||
|  | ||||
|     // FIXME: do we require lockstep draws of randoms from all nodes, | ||||
|     // keeping seeds consistent? If so, place a barrier/broadcast in the | ||||
|     // fill routine. | ||||
|     template<class source> void Seed(source &src) | ||||
|     { | ||||
|       typename source::result_type init = src(); | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&init,sizeof(init)); | ||||
|       _generators[0] = RngEngine(init); | ||||
|       _seeded=1; | ||||
|     }     | ||||
|  | ||||
|     GridSerialRNG() : GridRNGbase() { | ||||
|       _generators.resize(1); | ||||
|       _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); | ||||
|       _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|       _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); | ||||
|       _seeded=0; | ||||
|       _uid.resize(1,std::uniform_int_distribution<uint32_t>() ); | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|     template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ | ||||
|  | ||||
|       typedef typename sobj::scalar_type scalar_type; | ||||
| @@ -183,7 +264,7 @@ namespace Grid { | ||||
|  | ||||
|       dist[0].reset(); | ||||
|       for(int idx=0;idx<words;idx++){ | ||||
| 	fillScalar(buf[idx],dist[0],_generators[0]); | ||||
|   fillScalar(buf[idx],dist[0],_generators[0]); | ||||
|       } | ||||
|  | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
| @@ -215,7 +296,7 @@ namespace Grid { | ||||
|       RealF *pointer=(RealF *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<2*vComplexF::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|   fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
| @@ -223,7 +304,7 @@ namespace Grid { | ||||
|       RealD *pointer=(RealD *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<2*vComplexD::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|   fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
| @@ -231,7 +312,7 @@ namespace Grid { | ||||
|       RealF *pointer=(RealF *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<vRealF::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|   fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
| @@ -244,155 +325,184 @@ namespace Grid { | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
|      | ||||
|  | ||||
|     void SeedRandomDevice(void){ | ||||
|       std::random_device rd; | ||||
|       Seed(rd); | ||||
|     } | ||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|       std::seed_seq src(seeds.begin(),seeds.end()); | ||||
|       Seed(src,0); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
|  | ||||
|   class GridParallelRNG : public GridRNGbase { | ||||
|  | ||||
|     double _time_counter; | ||||
|  | ||||
|   public: | ||||
|  | ||||
|     GridBase *_grid; | ||||
|     unsigned int _vol; | ||||
|  | ||||
|     int generator_idx(int os,int is) { | ||||
|       return is*_grid->oSites()+os; | ||||
|     } | ||||
|  | ||||
|     GridParallelRNG(GridBase *grid) : GridRNGbase() { | ||||
|       _grid = grid; | ||||
|       _vol  =_grid->iSites()*_grid->oSites(); | ||||
|  | ||||
|       _generators.resize(_vol); | ||||
|       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); | ||||
|       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); | ||||
|       _seeded=0; | ||||
|       _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|     //FIXME implement generic IO and create state save/restore | ||||
|     //void SaveState(const std::string &file); | ||||
|     //void LoadState(const std::string &file); | ||||
|  | ||||
|     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ | ||||
|  | ||||
|       typedef typename vobj::scalar_object scalar_object; | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|       double inner_time_counter = usecond(); | ||||
|  | ||||
|       int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid | ||||
|       int Nsimd  = _grid->Nsimd();   // guaranteed to be the same for l._grid too | ||||
|       int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor of multiplicity | ||||
|       int words  = sizeof(scalar_object) / sizeof(scalar_type); | ||||
|  | ||||
|       parallel_for(int ss=0;ss<osites;ss++){ | ||||
|         std::vector<scalar_object> buf(Nsimd); | ||||
|         for (int m = 0; m < multiplicity; m++) {  // draw from the same generator multiplicity times | ||||
|  | ||||
|           int sm = multiplicity * ss + m;  // maps the generator site to the fine site | ||||
|  | ||||
|           for (int si = 0; si < Nsimd; si++) { | ||||
|  | ||||
|             int gdx = generator_idx(ss, si);  // index of generator state | ||||
|             scalar_type *pointer = (scalar_type *)&buf[si]; | ||||
|             dist[gdx].reset(); | ||||
|             for (int idx = 0; idx < words; idx++) | ||||
|               fillScalar(pointer[idx], dist[gdx], _generators[gdx]); | ||||
|           } | ||||
|           // merge into SIMD lanes, FIXME suboptimal implementation | ||||
|           merge(l._odata[sm], buf); | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       _time_counter += usecond()- inner_time_counter; | ||||
|     }; | ||||
|  | ||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|  | ||||
|       // Everyone generates the same seed_seq based on input seeds | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|  | ||||
|       std::seed_seq source(seeds.begin(),seeds.end()); | ||||
|  | ||||
|       RngEngine master_engine(source); | ||||
|  | ||||
| #ifdef RNG_FAST_DISCARD | ||||
|       //////////////////////////////////////////////// | ||||
|       // Skip ahead through a single stream. | ||||
|       // Applicable to SITMO and other hash-based/crypto RNGs. | ||||
|       // Should be applicable to the Mersenne Twister, but the C++11 | ||||
|       // MT implementation does not provide fast discard, even though | ||||
|       // in principle this is possible. | ||||
|       //////////////////////////////////////////////// | ||||
|       std::vector<int> gcoor; | ||||
|       int rank,o_idx,i_idx; | ||||
|  | ||||
|       // Everybody loops over global volume. | ||||
|       for(int gidx=0;gidx<_grid->_gsites;gidx++){ | ||||
|  | ||||
| 	Skip(master_engine); // Skip to next RNG sequence | ||||
|  | ||||
| 	// Where is it? | ||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); | ||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
|  | ||||
| 	// If this is one of mine we take it | ||||
| 	if( rank == _grid->ThisRank() ){ | ||||
| 	  int l_idx=generator_idx(o_idx,i_idx); | ||||
| 	  _generators[l_idx] = master_engine; | ||||
| 	} | ||||
|  | ||||
|       } | ||||
| #else  | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       // Machine- and thread-decomposition-dependent seeding is efficient | ||||
|       // and maximally parallel, but NOT reproducible from machine to machine. | ||||
|       // Not ideal, but it is the fastest way to reseed all nodes. | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       { | ||||
| 	// Obtain one Reseed per processor | ||||
| 	int Nproc = _grid->ProcessorCount(); | ||||
| 	std::vector<RngEngine> seeders(Nproc); | ||||
| 	int me= _grid->ThisRank(); | ||||
| 	for(int p=0;p<Nproc;p++){ | ||||
| 	  seeders[p] = Reseed(master_engine); | ||||
| 	} | ||||
| 	master_engine = seeders[me]; | ||||
|       } | ||||
|  | ||||
|       { | ||||
| 	// Obtain one reseeded generator per thread | ||||
| 	int Nthread = GridThread::GetThreads(); | ||||
| 	std::vector<RngEngine> seeders(Nthread); | ||||
| 	for(int t=0;t<Nthread;t++){ | ||||
| 	  seeders[t] = Reseed(master_engine); | ||||
| 	} | ||||
|  | ||||
| 	parallel_for(int t=0;t<Nthread;t++) { | ||||
| 	  // set up one per local site in threaded fashion | ||||
| 	  std::vector<uint32_t> newseeds; | ||||
| 	  std::uniform_int_distribution<uint32_t> uid;	 | ||||
| 	  for(int l=0;l<_grid->lSites();l++) { | ||||
| 	    if ( (l%Nthread)==t ) { | ||||
| 	      _generators[l] = Reseed(seeders[t],newseeds,uid); | ||||
| 	    } | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
| #endif | ||||
|     } | ||||
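|  | ||||
|     // Typical call pattern (sketch, as used throughout Grid's tests): | ||||
|     //   GridParallelRNG pRNG(grid); | ||||
|     //   std::vector<int> seeds({1,2,3,4}); | ||||
|     //   pRNG.SeedFixedIntegers(seeds); | ||||
|     // With RNG_FAST_DISCARD the resulting fields are independent of the | ||||
|     // machine decomposition; the fallback path is quicker to seed but is not. | ||||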
|  | ||||
|     void Report(){ | ||||
|       std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     // Support for rigorous test of RNG's | ||||
|     // Return uniform random uint32_t from requested site generator | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     uint32_t GlobalU01(int gsite){ | ||||
|  | ||||
|       uint32_t the_number; | ||||
|       // who | ||||
|       std::vector<int> gcoor; | ||||
|       int rank,o_idx,i_idx; | ||||
|       _grid->GlobalIndexToGlobalCoor(gsite,gcoor); | ||||
|       _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
|  | ||||
|       // draw | ||||
|       int l_idx=generator_idx(o_idx,i_idx); | ||||
|       if( rank == _grid->ThisRank() ){ | ||||
| 	the_number = _uid[l_idx](_generators[l_idx]); | ||||
|       } | ||||
|        | ||||
|       // share & return | ||||
|       _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number)); | ||||
|       return the_number; | ||||
|     } | ||||
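|  | ||||
|     // Test sketch (illustrative): draw once from every site's generator, e.g. | ||||
|     // to histogram the values and check uniformity across the lattice. | ||||
|     //   for(int gsite=0;gsite<_grid->_gsites;gsite++){ | ||||
|     //     uint32_t u = GlobalU01(gsite);  // same value returned on every rank | ||||
|     //     // ... accumulate statistics on u ... | ||||
|     //   } | ||||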
|  | ||||
|   }; | ||||
|  | ||||
|   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  } | ||||
|   template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } | ||||
|   template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} | ||||
|  | ||||
|   template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); } | ||||
|   template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } | ||||
|   template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } | ||||
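|  | ||||
|   // Example of the free-function interface (sketch; LatticeComplex is the | ||||
|   // usual library typedef, grid any suitable GridBase pointer): | ||||
|   //   GridParallelRNG pRNG(grid); | ||||
|   //   pRNG.SeedFixedIntegers(std::vector<int>({1,2,3,4})); | ||||
|   //   LatticeComplex eta(grid); | ||||
|   //   gaussian(pRNG,eta);   // each real component drawn from N(0,1) | ||||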
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
| @@ -42,8 +42,7 @@ namespace Grid { | ||||
|       -> Lattice<decltype(trace(lhs._odata[0]))> | ||||
|     { | ||||
|       Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|             ret._odata[ss] = trace(lhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
| @@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP | ||||
|     inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> | ||||
|     { | ||||
|       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | ||||
|   template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ | ||||
|     half.checkerboard = cb; | ||||
|     int ssh=0; | ||||
|     //parallel_for | ||||
|     for(int ss=0;ss<full._grid->oSites();ss++){ | ||||
|       std::vector<int> coor; | ||||
|       int cbos; | ||||
| @@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | ||||
|   template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ | ||||
|     int cb = half.checkerboard; | ||||
|     int ssh=0; | ||||
|     //parallel_for | ||||
|     for(int ss=0;ss<full._grid->oSites();ss++){ | ||||
|       std::vector<int> coor; | ||||
|       int cbos; | ||||
| @@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ, | ||||
|     assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); | ||||
|   } | ||||
|  | ||||
|   parallel_for(int sf=0;sf<fine->oSites();sf++){ | ||||
|      | ||||
|     int sc; | ||||
|     std::vector<int> coor_c(_ndimension); | ||||
| @@ -186,8 +185,7 @@ template<class vobj,class CComplex> | ||||
|  | ||||
|   fine_inner = localInnerProduct(fineX,fineY); | ||||
|   blockSum(coarse_inner,fine_inner); | ||||
|   parallel_for(int ss=0;ss<coarse->oSites();ss++){ | ||||
|     CoarseInner._odata[ss] = coarse_inner._odata[ss]; | ||||
|   } | ||||
| } | ||||
| @@ -333,9 +331,6 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vvobj::scalar_object ssobj; | ||||
|  | ||||
|  | ||||
|   GridBase *ig = in._grid; | ||||
|   GridBase *og = out._grid; | ||||
|  | ||||
| @@ -347,10 +342,13 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | ||||
|   for(int d=0;d<no;d++){ | ||||
|     assert(ig->_processors[d]  == og->_processors[d]); | ||||
|     assert(ig->_ldimensions[d] == og->_ldimensions[d]); | ||||
|     assert(ig->lSites() == og->lSites()); | ||||
|   } | ||||
|  | ||||
|   parallel_for(int idx=0;idx<ig->lSites();idx++){ | ||||
|     sobj s; | ||||
|     ssobj ss; | ||||
|  | ||||
|     std::vector<int> lcoor(ni); | ||||
|     ig->LocalIndexToLocalCoor(idx,lcoor); | ||||
|     peekLocalSite(s,in,lcoor); | ||||
| @@ -361,10 +359,9 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | ||||
|  | ||||
|  | ||||
| template<class vobj> | ||||
| void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|  | ||||
|   GridBase *lg = lowDim._grid; | ||||
|   GridBase *hg = higherDim._grid; | ||||
| @@ -386,17 +383,16 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int | ||||
|   } | ||||
|  | ||||
|   // the above should guarantee that the operations are local | ||||
|   // Guido: check the threading here | ||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ | ||||
|     sobj s; | ||||
|     std::vector<int> lcoor(nl); | ||||
|     std::vector<int> hcoor(nh); | ||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||
|     int ddl=0; | ||||
|     hcoor[orthog] = slice; | ||||
|     for(int d=0;d<nh;d++){ | ||||
|       if ( d!=orthog ) {  | ||||
| 	hcoor[d]=lcoor[ddl++]; | ||||
|       } | ||||
|     } | ||||
|     peekLocalSite(s,lowDim,lcoor); | ||||
| @@ -405,10 +401,9 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice, int orthog) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|  | ||||
|   GridBase *lg = lowDim._grid; | ||||
|   GridBase *hg = higherDim._grid; | ||||
| @@ -429,16 +424,16 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in | ||||
|     } | ||||
|   } | ||||
|   // the above should guarantee that the operations are local | ||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ | ||||
|     sobj s; | ||||
|     std::vector<int> lcoor(nl); | ||||
|     std::vector<int> hcoor(nh); | ||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||
|     int ddl=0; | ||||
|     hcoor[orthog] = slice; | ||||
|     for(int d=0;d<nh;d++){ | ||||
|       if ( d!=orthog ) {  | ||||
| 	hcoor[d]=lcoor[ddl++]; | ||||
|       } | ||||
|     } | ||||
|     peekLocalSite(s,higherDim,hcoor); | ||||
| @@ -449,10 +444,9 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in | ||||
|  | ||||
|  | ||||
| template<class vobj> | ||||
| void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|  | ||||
|   GridBase *lg = lowDim._grid; | ||||
|   GridBase *hg = higherDim._grid; | ||||
| @@ -469,8 +463,8 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice | ||||
|   } | ||||
|  | ||||
|   // the above should guarantee that the operations are local | ||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ | ||||
|     sobj s; | ||||
|     std::vector<int> lcoor(nl); | ||||
|     std::vector<int> hcoor(nh); | ||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||
| @@ -488,7 +482,6 @@ template<class vobj> | ||||
| void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|  | ||||
|   GridBase *lg = lowDim._grid; | ||||
|   GridBase *hg = higherDim._grid; | ||||
| @@ -505,8 +498,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic | ||||
|   } | ||||
|  | ||||
|   // the above should guarantee that the operations are local | ||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ | ||||
|     sobj s; | ||||
|     std::vector<int> lcoor(nl); | ||||
|     std::vector<int> hcoor(nh); | ||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||
| @@ -558,7 +551,10 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine) | ||||
|  | ||||
| //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order | ||||
| template<typename vobj, typename sobj> | ||||
| typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type  | ||||
| unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in) | ||||
| { | ||||
|  | ||||
|   typedef typename vobj::vector_type vtype; | ||||
|    | ||||
|   GridBase* in_grid = in._grid; | ||||
| @@ -574,8 +570,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj> | ||||
|     in_grid->iCoorFromIindex(in_icoor[lane], lane); | ||||
|   } | ||||
|    | ||||
|   parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index | ||||
|     //Assemble vector of pointers to output elements | ||||
|     std::vector<sobj*> out_ptrs(in_nsimd); | ||||
|  | ||||
| @@ -598,6 +593,54 @@ PARALLEL_FOR_LOOP | ||||
|     extract1(in_vobj, out_ptrs, 0); | ||||
|   } | ||||
| } | ||||
| //Copy an array of scalar objects in lexicographic order into a SIMD-vectorized lattice | ||||
| template<typename vobj, typename sobj> | ||||
| typename std::enable_if<isSIMDvectorized<vobj>::value  | ||||
|                     && !isSIMDvectorized<sobj>::value, void>::type  | ||||
| vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out) | ||||
| { | ||||
|  | ||||
|   typedef typename vobj::vector_type vtype; | ||||
|    | ||||
|   GridBase* grid = out._grid; | ||||
|   assert(in.size()==grid->lSites()); | ||||
|    | ||||
|   int ndim     = grid->Nd(); | ||||
|   int nsimd    = vtype::Nsimd(); | ||||
|  | ||||
|   std::vector<std::vector<int> > icoor(nsimd); | ||||
|        | ||||
|   for(int lane=0; lane < nsimd; lane++){ | ||||
|     icoor[lane].resize(ndim); | ||||
|     grid->iCoorFromIindex(icoor[lane],lane); | ||||
|   } | ||||
|    | ||||
|   parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index | ||||
|     //Assemble vector of pointers to the source elements | ||||
|     std::vector<sobj*> ptrs(nsimd); | ||||
|  | ||||
|     std::vector<int> ocoor(ndim); | ||||
|     grid->oCoorFromOindex(ocoor, oidx); | ||||
|  | ||||
|     std::vector<int> lcoor(grid->Nd()); | ||||
|        | ||||
|     for(int lane=0; lane < nsimd; lane++){ | ||||
|  | ||||
|       for(int mu=0;mu<ndim;mu++){ | ||||
| 	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu]; | ||||
|       } | ||||
|  | ||||
|       int lex; | ||||
|       Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions); | ||||
|       ptrs[lane] = &in[lex]; | ||||
|     } | ||||
|      | ||||
|     //pack from those ptrs | ||||
|     vobj vecobj; | ||||
|     merge1(vecobj, ptrs, 0); | ||||
|     out._odata[oidx] = vecobj;  | ||||
|   } | ||||
| } | ||||
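|  | ||||
| // Round-trip sketch (exampleLexRoundTrip is an illustrative name): unvectorize | ||||
| // then revectorize reproduces the field; the pair is handy for IO and for | ||||
| // precision conversion. | ||||
| template<class vobj> | ||||
| inline void exampleLexRoundTrip(Lattice<vobj> &out,const Lattice<vobj> &in) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   std::vector<sobj> buf(in._grid->lSites()); | ||||
|   unvectorizeToLexOrdArray(buf,in);   // SIMD lattice -> lex-ordered scalars | ||||
|   vectorizeFromLexOrdArray(buf,out);  // lex-ordered scalars -> SIMD lattice | ||||
| } | ||||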
|  | ||||
| //Convert a Lattice from one precision to another | ||||
| template<class VobjOut, class VobjIn> | ||||
| @@ -623,8 +666,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | ||||
|   std::vector<SobjOut> in_slex_conv(in_grid->lSites()); | ||||
|   unvectorizeToLexOrdArray(in_slex_conv, in); | ||||
|      | ||||
|   parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){ | ||||
|     std::vector<int> out_ocoor(ndim); | ||||
|     out_grid->oCoorFromOindex(out_ocoor, out_oidx); | ||||
|  | ||||
| @@ -643,9 +685,5 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | ||||
|   } | ||||
| } | ||||
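|  | ||||
| // Usage sketch (field typedefs are Grid's usual single/double aliases, | ||||
| // assumed here): | ||||
| //   LatticeFermionD src_d(FGrid_d); | ||||
| //   LatticeFermionF src_f(FGrid_f);   // same global volume, single precision | ||||
| //   precisionChange(src_f,src_d);     // (out,in) ordering as above | ||||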
|   | ||||
|  | ||||
|    | ||||
|  | ||||
|   | ||||
| } | ||||
| #endif | ||||
|   | ||||
Some files were not shown because too many files have changed in this diff.