mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-24 17:54:47 +01:00 
			
		
		
		
	Compare commits
	
		
			1571 Commits
		
	
	
		
			0.8.2
			...
			feature/a2
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | d8c0c0ba0a | ||
|  | c6cf918d4c | ||
|  | 6d0a907c5c | ||
|  | 3276aa67dc | ||
|  | 7cf7f11e1a | ||
| f1f655d92b | |||
| 43334e88c3 | |||
| 4f1e66b044 | |||
| 64fe5b21b4 | |||
|  | ee9889821d | ||
| eb470aa6dc | |||
| 77af9a3ddc | |||
| 102089798c | |||
| 39cea8b5a7 | |||
| a65f66d2db | |||
|  | 936c5ecf69 | ||
|  | 22cfbdbbb3 | ||
|  | 093d1ee21b | ||
|  | d6ba2581ce | ||
|  | 577c064184 | ||
|  | 2ff1fa6fad | ||
|  | 70be1bd8be | ||
| 4ef50ba31f | |||
| 3e97a26f90 | |||
| 599f28f6ef | |||
|  | c48da35921 | ||
|  | 6c5fa8dcd8 | ||
|  | 0d2f913a1a | ||
|  | 1a74816c25 | ||
|  | 73de335256 | ||
|  | 228fd450ce | ||
|  | b949cf6b12 | ||
|  | 11bc1aeadc | ||
|  | 66005929af | ||
|  | ff7c847735 | ||
|  | 1aa988b2af | ||
|  | edf17708a8 | ||
|  | f46f029dbb | ||
|  | 3dccd7aa2c | ||
|  | 65e6e7da6f | ||
|  | b5e87e8d97 | ||
|  | 5f5807d60a | ||
|  | 7974acff54 | ||
| f0d17d2b49 | |||
| 244c003a1b | |||
| 0174f5f742 | |||
|  | 32b2b59be4 | ||
|  | 86bb0cc24b | ||
|  | 84c19587e7 | ||
|  | 237ce92540 | ||
|  | a7ffc61e82 | ||
|  | fd97f64612 | ||
|  | 8720aecb80 | ||
|  | cdf0a04fc5 | ||
|  | 616d3dd737 | ||
|  | 8b066baca8 | ||
|  | e97f3688db | ||
|  | 89a1e78390 | ||
|  | ffbb3fc02c | ||
|  | 5a73ef3647 | ||
|  | 87e5d2f4b7 | ||
|  | d720f10758 | ||
|  | 14fcd0912a | ||
|  | 3111c0bd4f | ||
|  | e03064490e | ||
|  | 1a4c8c3387 | ||
|  | 2b1e259441 | ||
|  | f39c2a240b | ||
|  | 0d95805cde | ||
|  | f67830587f | ||
|  | 6bf7f839ff | ||
|  | e3147881a9 | ||
|  | fb559614ad | ||
|  | e93e12b6a4 | ||
|  | 0c3112cd94 | ||
|  | 8cfd5d2639 | ||
|  | 1c9f20b15e | ||
|  | 32237895bd | ||
|  | c5c2dbc0ce | ||
|  | 9fcb47ee63 | ||
|  | 1d252d0922 | ||
|  | 006cc8a8f1 | ||
|  | cf2938688a | ||
|  | ee63721bad | ||
|  | 22c5168d70 | ||
|  | 949ac3cd24 | ||
|  | 7bc0166c1c | ||
|  | cb0d1b3399 | ||
|  | d1f1ccc705 | ||
|  | c7519a237a | ||
|  | 32be2b13d3 | ||
|  | 92b342a477 | ||
|  | 556da86ac3 | ||
|  | 8285e41574 | ||
|  | f999408e92 | ||
|  | a7abda89e2 | ||
|  | 7860a50f70 | ||
|  | 6c6812a5ca | ||
|  | 8358ee38c4 | ||
|  | 1f154fe652 | ||
|  | d708c0258d | ||
|  | a7635fd5ba | ||
|  | ebb60330c9 | ||
| 5aa60be17d | |||
|  | 32fbdf4fb1 | ||
|  | a9847aa866 | ||
|  | 2e652431e5 | ||
|  | 8b5b55b682 | ||
|  | 0e3c49f687 | ||
|  | cb7ee37562 | ||
|  | 82f71643a4 | ||
|  | d24d8e8398 | ||
|  | 162e4bb567 | ||
|  | 07c0c02f8c | ||
|  | 8c31c065b5 | ||
|  | b1c86900b2 | ||
|  | bbbee5660d | ||
|  | ea08f193e7 | ||
|  | 2bb2c68e15 | ||
|  | efe5bc6a3c | ||
|  | 384da487bd | ||
|  | ee1de82a53 | ||
|  | 2b576fc185 | ||
|  | 52081acfa5 | ||
|  | b01b7f761a | ||
|  | c83471bfd0 | ||
|  | ab0c5d77fb | ||
|  | 779e3c7442 | ||
|  | 0c570824f2 | ||
|  | f8b8e00090 | ||
|  | 0dd1bdfa94 | ||
|  | 1d65e2f62c | ||
|  | 93920c4811 | ||
|  | 6859a3e1d4 | ||
|  | 21ca182c36 | ||
| 053b4dd495 | |||
|  | 42bb5f0721 | ||
|  | 253bcc3426 | ||
| a887206413 | |||
|  | 591ebb6213 | ||
|  | 56e2f7d088 | ||
|  | 525418abfb | ||
|  | 5f780806c2 | ||
|  | 3c6ffcb48c | ||
|  | 87984ece7d | ||
|  | e9b295f967 | ||
|  | 224cbf0453 | ||
|  | c1e57d4357 | ||
|  | 28a1fcaaff | ||
|  | 6b64727161 | ||
|  | 04863f8f38 | ||
|  | 04927d2e40 | ||
|  | 7caed4edd9 | ||
|  | 59c51d2c35 | ||
|  | ff53b231c8 | ||
|  | fc19cf905b | ||
|  | 2a1387e992 | ||
|  | 9bfa51bffb | ||
|  | 38532753f4 | ||
|  | 949be9605c | ||
|  | 63cf201ee7 | ||
|  | c8af498a2a | ||
|  | ddb192bac7 | ||
|  | 7666300a6f | ||
|  | 4a4b9e305d | ||
|  | 9b2d2d0fc3 | ||
|  | 5011753f4f | ||
|  | dbaeefaeef | ||
|  | dee96cbf82 | ||
|  | dd3ebc2ce4 | ||
|  | 103e7ae2f0 | ||
|  | 29ae5615c0 | ||
|  | 6240e02619 | ||
|  | f4033ad8cb | ||
|  | f1fe444d4f | ||
|  | dae820aa96 | ||
|  | 5daf176f4a | ||
|  | e96c86ec14 | ||
|  | c2c3cad20d | ||
|  | edec9ee2e2 | ||
|  | ed70cce542 | ||
|  | 4701201b5f | ||
|  | 0782b76ed4 | ||
|  | 0896f2cead | ||
|  | 181709bba4 | ||
|  | 091d5c605e | ||
|  | 90229cfb0f | ||
|  | 0475c46ecb | ||
|  | 3cca10e617 | ||
|  | 327da332bb | ||
| 43dc2814dd | |||
|  | f3a8d039a2 | ||
| 4e864e56c9 | |||
|  | 014dbfa464 | ||
|  | 3b0e07882f | ||
|  | 8e81a811d0 | ||
|  | aa13118127 | ||
|  | 6cdb09c884 | ||
|  | a65bc64f10 | ||
|  | 11dec4883c | ||
|  | afa458c812 | ||
|  | dc50190b8f | ||
|  | 96e8e44fd4 | ||
|  | 5fc8a273e7 | ||
| d671a63e78 | |||
| 2c22db841a | |||
|  | 856d168e41 | ||
| 6235c7ba98 | |||
| 7e13724882 | |||
|  | b6cbdd2aa3 | ||
|  | a2188ea875 | ||
|  | 989af65807 | ||
|  | 60db3133d3 | ||
|  | c9b737a4e7 | ||
|  | 037bb6ea73 | ||
| 05ebc458e2 | |||
|  | 3753508957 | ||
|  | c1677fccf6 | ||
| 35e8e31749 | |||
| 34813e9b04 | |||
|  | 373cf61abb | ||
| 4e8fbc4b49 | |||
|  | 516ac1d4d5 | ||
| 318f63eb34 | |||
| 16503d7532 | |||
|  | 0fa93383b7 | ||
|  | 0a827aa7bf | ||
|  | 165c68e28e | ||
|  | b32b1ca642 | ||
|  | 9479bc8486 | ||
|  | 8a5c13d5fb | ||
|  | bdccb0c91f | ||
|  | 68b45f6444 | ||
|  | ef9b3e658a | ||
|  | b9ca40cc44 | ||
|  | 2f421a5db1 | ||
|  | 10192dfc71 | ||
|  | c69a3b6ef6 | ||
|  | 852fc1b001 | ||
|  | 2b5de5bba5 | ||
|  | 2e85cae74e | ||
|  | 76c823781e | ||
|  | 114db3b99d | ||
|  | 49e123dbda | ||
|  | 8cec294ec9 | ||
|  | eb5b720e94 | ||
|  | b2736ec80b | ||
|  | 086256a032 | ||
|  | afc7426f39 | ||
|  | 7c061e20c9 | ||
|  | e5d1c09665 | ||
|  | 8016a465ae | ||
|  | d8b9742092 | ||
|  | 1bd87c35d7 | ||
|  | fa856c9669 | ||
|  | 48008e4d8b | ||
|  | 55cdb17691 | ||
|  | 2ed39ebb7a | ||
|  | 96671bbb24 | ||
|  | 554542b773 | ||
|  | 03da4040e2 | ||
|  | e583035614 | ||
|  | 3c3d6a94f3 | ||
|  | 205ea4bbb2 | ||
|  | 039eb7b2eb | ||
|  | f7e4bd1f6d | ||
|  | 0afecfcae7 | ||
|  | ba40a3f763 | ||
|  | aa920aa532 | ||
|  | c0d8e4dce5 | ||
|  | 0ca1992151 | ||
|  | df2b0c4e79 | ||
|  | 9cfd64c604 | ||
|  | e478404291 | ||
|  | 9aafd20468 | ||
|  | 5d834486c9 | ||
|  | f7373e97a4 | ||
|  | 9e15474999 | ||
|  | 152b525a4d | ||
|  | d18994eddc | ||
|  | b8bd8cd2ae | ||
|  | 736b19485e | ||
|  | c7637a84ad | ||
|  | a7772c827b | ||
| 8e83398861 | |||
|  | 843ca9350a | ||
| f47b2b6e13 | |||
|  | 5bfd1470ad | ||
|  | 6957b0b58a | ||
|  | d73f0b8618 | ||
|  | 0b3a3562c3 | ||
|  | 710fee5d26 | ||
|  | bab0bf2e93 | ||
|  | 848079e8ba | ||
|  | f2a4f13111 | ||
|  | 4180a4a8a7 | ||
| b9b9fcbfa0 | |||
| bbe48998a8 | |||
| 6446671a9c | |||
| 110373ea79 | |||
| a986786192 | |||
|  | edd1c924eb | ||
|  | 9b6b0caa55 | ||
|  | 2a48617ac5 | ||
|  | 876d9c957c | ||
|  | 295e535f93 | ||
|  | 58a31f0763 | ||
|  | 3d2fe80780 | ||
|  | e43fce1083 | ||
|  | 0dfdf80407 | ||
|  | 2912071f83 | ||
|  | 26605ef387 | ||
| 1e5ac576d9 | |||
| d5492b426f | |||
| d428858c9d | |||
|  | f7698b93ca | ||
|  | 7ce77690b8 | ||
|  | 164ed9c434 | ||
|  | a54157e682 | ||
|  | 58b6a0d8d1 | ||
|  | 1a5e562bde | ||
|  | 45be26cf3f | ||
|  | 5227ffccb7 | ||
| a0b47cc0be | |||
|  | b766038810 | ||
|  | cd9fd80a5d | ||
| d6100cc35a | |||
|  | 29a1530510 | ||
|  | 15119eaf03 | ||
| 188e12ffbb | |||
|  | e940f4db7e | ||
|  | 9c7f269489 | ||
|  | 07feaf9531 | ||
|  | 7983ff2fdd | ||
|  | 2db814f2b7 | ||
|  | 6418f06771 | ||
| 8a5576f73c | |||
|  | 997790ad24 | ||
|  | 900d6fad21 | ||
| 799ff0c96e | |||
| 5fd5c25114 | |||
| 62b3799c77 | |||
|  | d1a89af8c9 | ||
|  | d91ba1f6cc | ||
|  | f4d27e7090 | ||
|  | feb1ff3494 | ||
|  | 8ef6175acc | ||
|  | e4399e3ee1 | ||
|  | 98ea67b636 | ||
|  | 421a4395af | ||
|  | cf95a460a5 | ||
| a60e20f265 | |||
|  | 9261c0da89 | ||
|  | b350a24ded | ||
|  | 13a0db7162 | ||
|  | 18177d9709 | ||
|  | 7bf42b9c0e | ||
|  | 2d6f4e0c09 | ||
|  | 7f06c40107 | ||
|  | 9f75065205 | ||
|  | 271a02230e | ||
|  | b1e8b5b5ce | ||
|  | 25d2521d77 | ||
|  | 500ef17143 | ||
|  | ee9dd22643 | ||
|  | a977d9901b | ||
|  | 667ffb70db | ||
|  | 65b3059bd7 | ||
|  | 5238808ccd | ||
|  | 8f88fee680 | ||
|  | fcc412a1c2 | ||
|  | 12e415330f | ||
|  | 66e0811317 | ||
|  | 55e743aad6 | ||
|  | e2ab0d671e | ||
|  | 7a4c5dbbd5 | ||
|  | 3f00b8f6c7 | ||
|  | 6d7043e0c2 | ||
|  | b0f24ec302 | ||
|  | fb2834bf82 | ||
|  | 78f75b0e9f | ||
|  | 62dd0bfe58 | ||
|  | db952993fa | ||
|  | b8f0878981 | ||
|  | df586a142d | ||
|  | 7a446d5b7f | ||
|  | e7d7ea4f8f | ||
|  | f8e1941327 | ||
| 65aa54804e | |||
|  | 293bfe17d1 | ||
|  | a8f3a111a5 | ||
|  | 5c23abe507 | ||
|  | 22c654182a | ||
|  | 6f0439c0e4 | ||
|  | 4f9a7c5d76 | ||
|  | fcd90705bc | ||
|  | 4bcdb4ff95 | ||
|  | 1c10933db1 | ||
|  | 52d8d576d0 | ||
|  | ada0a7a83b | ||
|  | efe2f2d48b | ||
|  | 45d4cf0971 | ||
|  | ac614cbc53 | ||
|  | ec8e060ec7 | ||
|  | 5c54f27ac1 | ||
|  | 4ed9379535 | ||
|  | 858e348a6d | ||
|  | 3b3680c64e | ||
|  | 2a926b3dc6 | ||
|  | 845a045493 | ||
|  | eb8848a071 | ||
|  | f31e3278a6 | ||
|  | ca234325bc | ||
| c97f780784 | |||
|  | 78bdb0ff6a | ||
|  | decab587a0 | ||
| 202f025fc7 | |||
|  | 3c702b510b | ||
|  | 519ce19128 | ||
|  | 8d166a81c0 | ||
|  | aa62ca9046 | ||
|  | 2dee4791db | ||
|  | 548b3bf43c | ||
|  | a55d0ba8fe | ||
|  | 5de9547db5 | ||
|  | 6a3b09cf02 | ||
|  | 10de4bfc23 | ||
|  | 2ce7f2b4d8 | ||
|  | 88d6ff8f1d | ||
|  | 803329af99 | ||
|  | 9d96899aa8 | ||
|  | 86939dbf1a | ||
| 317645aaeb | |||
|  | e280ec6b0b | ||
| d5a180d914 | |||
| d2928761dd | |||
| f2a74c603f | |||
| 5f22810f55 | |||
|  | 92e25488f8 | ||
|  | 89ef2b7dc2 | ||
|  | 7606554b76 | ||
|  | c8fc0b3e0c | ||
|  | ccb5e8374b | ||
|  | b88fd436e7 | ||
|  | 155bcd4ff3 | ||
|  | d1daab601a | ||
|  | e5d7910fa7 | ||
|  | 94b9a9474c | ||
|  | bf62ec163d | ||
|  | 8415e23fc6 | ||
|  | 76c93aa44e | ||
|  | 3137628222 | ||
|  | ce965ee6bb | ||
|  | 911fbb0f36 | ||
|  | eb293e9909 | ||
|  | f548114ff6 | ||
|  | dab8c01c3d | ||
|  | 2f3dd0703d | ||
|  | 2e963d1a78 | ||
|  | bf52e7cc96 | ||
|  | 61d017d0a5 | ||
|  | 04a661cafe | ||
|  | a7fa86dc29 | ||
|  | 0c1efa5235 | ||
|  | fdd9b14e82 | ||
|  | e66669d300 | ||
|  | 0efaf3c4fa | ||
|  | 3ef519aaa4 | ||
|  | b473405652 | ||
|  | 114ebb7914 | ||
|  | 9b7a6d197f | ||
|  | 59cd7f3b70 | ||
|  | 28d6be2a4e | ||
| 6b6c5aa626 | |||
| 9210b0aa6e | |||
| ad01290545 | |||
|  | 25150eb2e0 | ||
|  | 95f66cc93c | ||
|  | 12eb2a6a34 | ||
|  | 7c8902b04f | ||
|  | 4278caa030 | ||
|  | be37dfb6f8 | ||
|  | 5e8437029f | ||
|  | e279b2be29 | ||
|  | 48e6efc7c9 | ||
|  | 3e49dc8a67 | ||
|  | 96ac56cace | ||
|  | 2b037e3daa | ||
|  | 2d2de7aede | ||
|  | ce97638bac | ||
|  | 53e3ab4131 | ||
|  | d566637cec | ||
|  | 51bed48cd2 | ||
|  | b875edceab | ||
|  | 29df60c0cb | ||
|  | 8d97e2a02a | ||
|  | ed23f6be20 | ||
|  | cad76827b0 | ||
|  | 310867d46a | ||
|  | e598178d94 | ||
|  | 723457d467 | ||
|  | 6f40021842 | ||
|  | 9cd33a7b9c | ||
|  | 639dc1ab21 | ||
|  | 9117f61109 | ||
|  | 622d5eaa3e | ||
|  | e66d48c142 | ||
|  | 9dad7a0094 | ||
|  | 1282e1067f | ||
|  | f5ad4f3de8 | ||
|  | 275c1c920f | ||
|  | fe700a183a | ||
|  | 34108296cd | ||
|  | ce255ec359 | ||
|  | 1c096626cb | ||
|  | ce8b247426 | ||
|  | 80481f81be | ||
|  | d85dcc72df | ||
|  | 3fedcd6d52 | ||
|  | e7050a7aed | ||
|  | e138bc7204 | ||
|  | 25ba4c5f80 | ||
|  | 6d4fb35d84 | ||
|  | 775eaee199 | ||
|  | 0fd2827d5d | ||
|  | bdd79f9ef8 | ||
|  | 0695f8cec2 | ||
|  | 9fa705c5a0 | ||
|  | 56cefadf9b | ||
|  | 9d82855c5d | ||
|  | 97d61f2564 | ||
|  | 331f5a53dc | ||
|  | a23dc295ac | ||
|  | 11a8668d19 | ||
|  | cded7670d0 | ||
|  | feb029fb66 | ||
|  | 08904f830e | ||
|  | fa9cd50c5b | ||
|  | 5a62ebe7b1 | ||
|  | 7c11525d1a | ||
|  | 42c1dbb1d1 | ||
|  | 6179acfda0 | ||
|  | fa747173d1 | ||
|  | 07601ac1f5 | ||
|  | 705a8098b2 | ||
|  | a29b43d755 | ||
|  | 368c8369ce | ||
|  | c0d89a2dbb | ||
|  | 78ebd93281 | ||
|  | 3d58daf70f | ||
|  | bd155ca5c0 | ||
|  | 91e2cf9b40 | ||
|  | 3cc9947731 | ||
|  | f15eeb0283 | ||
|  | 0996ba9396 | ||
|  | 12afb0395f | ||
|  | ec4aa978ab | ||
|  | 966a203dcb | ||
|  | 44170cc15f | ||
|  | 7bc4a06f3f | ||
|  | cd659525e1 | ||
|  | dc2240d2d8 | ||
|  | 98cf20cf06 | ||
|  | cc3346073e | ||
|  | 3848da7c50 | ||
|  | b7d0cf6751 | ||
|  | 2c1a077369 | ||
|  | 6e3c3214a3 | ||
|  | d6ffadb33b | ||
|  | ae3abbe53d | ||
|  | 5fc0188205 | ||
|  | 4c3225412b | ||
|  | b8f7bfbb26 | ||
|  | 7b7c470917 | ||
|  | 532e226b22 | ||
|  | 6a13731818 | ||
|  | 67690df3bd | ||
|  | ce29b18dc9 | ||
|  | 421a0a8a36 | ||
|  | ac530636ca | ||
|  | 2d940a598c | ||
|  | c28c5fc61b | ||
|  | 015340d60c | ||
|  | 1cd4ee0706 | ||
|  | b8f71b6777 | ||
|  | 703dc20377 | ||
|  | d976e5c514 | ||
|  | d7b3efe893 | ||
|  | f710d7bd45 | ||
|  | cb336aa8f8 | ||
|  | 462900b48d | ||
|  | 0561c2edeb | ||
|  | 0184719216 | ||
|  | 24202dbc51 | ||
|  | d763c303c5 | ||
|  | 8e394d3bf9 | ||
|  | b881d5489b | ||
|  | 82306913a8 | ||
|  | 49f90cc7eb | ||
|  | b77af0210b | ||
|  | 5254ede2d8 | ||
|  | 16e5d7945e | ||
|  | decc99ca76 | ||
|  | 464cd65931 | ||
|  | a1ec2f4723 | ||
|  | ea9662ec85 | ||
|  | 52c74f1cac | ||
|  | 9a13d2992c | ||
|  | b0449ae270 | ||
|  | 1299225105 | ||
|  | 5925e7f405 | ||
|  | be1fd4930f | ||
|  | 377fa5dec1 | ||
|  | e8b78f596e | ||
|  | 09720c40cd | ||
|  | bb024dd114 | ||
|  | 52456b9ec7 | ||
|  | b285138be4 | ||
|  | c7dbf4c87e | ||
|  | 1e889c93b8 | ||
|  | 7379047482 | ||
|  | d836ce3b78 | ||
|  | cefaacbc07 | ||
|  | 0074ef7f69 | ||
|  | 20359ca15f | ||
|  | 736358b0cb | ||
|  | 6b692aa726 | ||
|  | 7f99e1cd3b | ||
|  | f3c89df948 | ||
|  | b7e6d111d7 | ||
|  | f39cf69c33 | ||
|  | 8e27338df2 | ||
|  | bcbb5e9d26 | ||
|  | 0ea7f5279d | ||
|  | 18e5de426d | ||
|  | e896d81235 | ||
|  | 7b8ccff4f4 | ||
|  | 68541606ab | ||
|  | 339ea10cc7 | ||
|  | d0d8dc8042 | ||
|  | 81eb1fd9f2 | ||
|  | cb93d32cd9 | ||
|  | 8f223962ff | ||
|  | 9a8a63467e | ||
|  | 36f06555a2 | ||
|  | d6c0e0756d | ||
|  | 3e41b1055c | ||
|  | 9fbcfe612c | ||
|  | e78a5e7838 | ||
|  | da8d87e9da | ||
|  | 8e3a05d89b | ||
|  | 8adc5da7dd | ||
|  | 29a244e423 | ||
|  | 18cbfecf02 | ||
|  | c933ac2248 | ||
|  | ad2c433574 | ||
|  | 86e7fb6e86 | ||
|  | fb91dda7be | ||
|  | 82cf7bc5ab | ||
|  | e452cc0a22 | ||
|  | 4d2b938166 | ||
|  | 10d16ab76c | ||
|  | 1f997fa484 | ||
|  | dc5024e88c | ||
|  | 6d77941990 | ||
|  | 0ee6e77cbc | ||
|  | 18d3cde29a | ||
|  | 7323099966 | ||
|  | 6379651cdd | ||
|  | ba4fd756b9 | ||
|  | d185fc1ebf | ||
|  | 96b36d8367 | ||
|  | 899f8b5065 | ||
|  | c8d0483fe9 | ||
|  | 0f214e5f76 | ||
|  | 8eea568426 | ||
|  | 9636324069 | ||
|  | 8a5489d9e6 | ||
|  | 8113845f9c | ||
|  | b47f73c222 | ||
|  | 5720ced0fd | ||
|  | 2c87b56b53 | ||
|  | dbad48d802 | ||
|  | 4557a1365a | ||
|  | 16e9b87d98 | ||
|  | 685eea3d0f | ||
|  | 65b48831fb | ||
|  | 57396fc595 | ||
|  | a2e199df50 | ||
|  | 020346c848 | ||
|  | c2625a127e | ||
|  | 8794d35c78 | ||
|  | 24bff6dbe6 | ||
|  | 45b15d10d3 | ||
|  | 33d6bbe32b | ||
|  | 7a1569bd46 | ||
|  | 6e2e904a0e | ||
|  | d92a17f359 | ||
|  | 47c063f984 | ||
|  | 7e27a5213a | ||
|  | fe72dc099b | ||
|  | ade4a126da | ||
|  | 7b59ab5bd7 | ||
|  | fcd8cfe257 | ||
|  | b4b53812cb | ||
|  | 085cac583f | ||
|  | 25e3b8640c | ||
|  | 54edb9906e | ||
|  | 44bbec50b0 | ||
|  | ec68b67d5d | ||
|  | 778450e0c8 | ||
|  | 567aa5f366 | ||
|  | 2ab7e2b175 | ||
|  | 6f61be044d | ||
|  | 269e00509e | ||
|  | a5e90b0ddc | ||
|  | 5622faf226 | ||
|  | 82ecd520c7 | ||
|  | 9ff459816f | ||
|  | eb737daeb5 | ||
|  | ffde81f22a | ||
|  | d8098f1ecd | ||
|  | aca788cf4f | ||
|  | a0e9f3b0a0 | ||
|  | a9342c6ae5 | ||
|  | ee6f96d85c | ||
|  | 4e9df9e93c | ||
|  | 9fe68857a9 | ||
|  | 37336c9e0c | ||
|  | 6c4da3bbc7 | ||
|  | a584b16c4a | ||
|  | 8ce7ebdca3 | ||
|  | 435653490e | ||
|  | 10a052d695 | ||
|  | acd5a01b65 | ||
|  | 1a82533d22 | ||
|  | ec7d96ce3b | ||
|  | e3c56fd9b3 | ||
|  | 955cc7790f | ||
|  | 1179123ac2 | ||
|  | 22e35c9ddd | ||
|  | 698b45e163 | ||
|  | f1744b3f01 | ||
|  | 2b3c22f03d | ||
|  | 8423a05940 | ||
|  | c16916cc45 | ||
|  | a865caf0d2 | ||
|  | 9ae4d369f3 | ||
|  | ec24a1f828 | ||
|  | 0efe63f6fa | ||
|  | b7ead6c16a | ||
|  | d9438627d9 | ||
|  | b23305dbe2 | ||
|  | d3b5c02e2d | ||
|  | 8b6541fb60 | ||
|  | 6da9aa9971 | ||
|  | 44e0360b97 | ||
|  | 9003c4a07c | ||
|  | b52fa38f8c | ||
|  | 3f1c4d8789 | ||
|  | 62692b68b9 | ||
|  | 311c35a15c | ||
|  | a3fe57f430 | ||
|  | 8dc0587621 | ||
|  | cfe5fa7a35 | ||
|  | e72e26c899 | ||
|  | 334f29becb | ||
|  | e56ead55ef | ||
|  | d74d443d1b | ||
|  | 4203105104 | ||
|  | ac19c0e04f | ||
|  | b48ca8a6ef | ||
|  | c48ae4f3ad | ||
|  | fb74de0798 | ||
|  | adc1eaee68 | ||
|  | 60330e05a3 | ||
|  | f9b8c0cccf | ||
|  | 3cad67e569 | ||
|  | 170ba4e619 | ||
|  | 204a090497 | ||
|  | 3c717c47ef | ||
|  | 5aca4e8670 | ||
|  | e223d0b99f | ||
|  | 2e220456d3 | ||
|  | 4333d97958 | ||
|  | 55c9c45d4b | ||
|  | e70e03f560 | ||
|  | ff5e2e0f47 | ||
|  | 4f3d1ea6e8 | ||
|  | b1768ba820 | ||
|  | 3ac5a69a57 | ||
|  | 50a74eaea3 | ||
|  | 8419fbb335 | ||
|  | 23a9b93cda | ||
|  | ecdc3ddebf | ||
|  | 606698511c | ||
|  | a97b814f0c | ||
|  | 7214681e11 | ||
|  | 143b75956c | ||
|  | 4a4203c610 | ||
|  | 2b598294c9 | ||
|  | d111c70c38 | ||
|  | ed2427d5f7 | ||
|  | ea2f34de7b | ||
|  | 63dc0fa7e9 | ||
|  | 5e6104e683 | ||
|  | 25e4ee3a49 | ||
|  | 4161429dcc | ||
|  | b5eb97206b | ||
|  | 0da906cf66 | ||
|  | 3decb5f886 | ||
|  | faa8bb9bc6 | ||
|  | 4c02ed6d0c | ||
|  | f757b80e1c | ||
|  | b8581be1da | ||
|  | 9fce1263be | ||
|  | ae565b006a | ||
|  | 8502660023 | ||
|  | 625a97a466 | ||
|  | bce2766fef | ||
|  | ce501afec6 | ||
|  | 1d10a3b3de | ||
|  | d1e02f50ff | ||
|  | 48b03c4590 | ||
|  | b3b9e608e1 | ||
|  | 4e87cbd400 | ||
|  | 4fc045b563 | ||
|  | fbf286b0e3 | ||
|  | 9dc3fe9922 | ||
|  | 6c9029fab7 | ||
|  | 8700dd4d0d | ||
|  | 9c16391e55 | ||
|  | 685d9bafef | ||
|  | d2d26b302d | ||
|  | 88cb004731 | ||
|  | a66bb8acba | ||
|  | 4ae35000a9 | ||
|  | 02b96b4602 | ||
|  | 11dded61e8 | ||
|  | 24cf3b9df5 | ||
|  | 9c8aa2047d | ||
|  | 204cfa1c5a | ||
|  | fe6845d38b | ||
| bff4eeec41 | |||
|  | d1fe4dce33 | ||
|  | 50ca3101de | ||
|  | 0faf40e207 | ||
|  | 5313e44d11 | ||
|  | 6bb9b67c93 | ||
| a0405c6d84 | |||
|  | c2a3231cdf | ||
|  | 5fb2ee89bb | ||
|  | 608a98d870 | ||
|  | 2df396380d | ||
|  | 64ba664637 | ||
|  | 4a70b2ffd4 | ||
| 2d659015ff | |||
| e63019ac50 | |||
|  | dde118fed9 | ||
|  | 1538bf8c34 | ||
|  | 4abc498ae3 | ||
|  | 93dfbfbfcd | ||
| f9e273d4bf | |||
| 584fa0a633 | |||
|  | 73cdca3973 | ||
|  | d716f8a0c9 | ||
| aa24f04911 | |||
| 1880e6d12d | |||
| 4a00513e65 | |||
|  | 7718ee199a | ||
|  | d7c7bff065 | ||
|  | 802675f062 | ||
| d56d8c923f | |||
| 00c3c6fc54 | |||
| b3d4ba8657 | |||
|  | a4d578bd5d | ||
|  | 7653649389 | ||
| a344a2227e | |||
| 4b9200b35c | |||
| 91be028507 | |||
| 3b05f91f5c | |||
|  | 8804271339 | ||
| 6d9f377913 | |||
| 18b603c5ae | |||
|  | e9784572af | ||
|  | f168a9e7ee | ||
|  | 50b6db75da | ||
|  | df065f1d57 | ||
| 578eb177e7 | |||
| 81b3f3d2ca | |||
| 7c7ffa3b10 | |||
|  | 1f098ceecf | ||
|  | c47c1a2472 | ||
| ec45b16840 | |||
| 9288019789 | |||
|  | 9c04139362 | ||
|  | cfc14a7432 | ||
| 31e40c26fa | |||
|  | 3f2fe5c7e7 | ||
|  | 76b6e8a01e | ||
| f9543982e4 | |||
| 3c9f2d4106 | |||
|  | cad26a736e | ||
|  | 4f2ac433f1 | ||
|  | f9e505108b | ||
|  | d2aced13da | ||
|  | 03d031d623 | ||
| 44a2d4854a | |||
| 292ff33f7f | |||
| 55886cf9db | |||
| c640923159 | |||
| 752530f352 | |||
| 34b9450fc9 | |||
|  | 5d6462b706 | ||
| f70c5b004a | |||
| 5bb9de9242 | |||
| 982a24514b | |||
|  | 97c6f770b4 | ||
|  | 4522f1e831 | ||
| c14547ddbe | |||
| 63c97db414 | |||
| 6ebb32ffbf | |||
| 07c97cb424 | |||
| 04b58de5de | |||
|  | 6e822b7201 | ||
|  | 625ccfcd72 | ||
| c77069244d | |||
| 9815ddb853 | |||
| 74a3a5b825 | |||
| 00e9416e0a | |||
| b6803a070a | |||
|  | bfd2770657 | ||
|  | 668b1e77c7 | ||
|  | e51744260f | ||
|  | e0987d7d81 | ||
| 26b94d7bda | |||
| df0c8b5d84 | |||
|  | a111d814db | ||
| e8bd8767c0 | |||
| 8cb96cb693 | |||
| b9bee45277 | |||
| bee24655cd | |||
| 886c895f81 | |||
| 59c8cc1588 | |||
| 11467a994d | |||
|  | 9f2ca98dfc | ||
| bf434b6bef | |||
| 41ff592515 | |||
|  | 48ec937c55 | ||
|  | 65731546b7 | ||
| 76c6a6772a | |||
| e7048231bc | |||
| 49babeab19 | |||
| fb2cb3015e | |||
|  | 53f45d2c7e | ||
|  | d889cebc60 | ||
| 9a225235b6 | |||
| dff7d9261d | |||
| 6f2663edf6 | |||
| d5024bd07e | |||
| 9c4189484a | |||
| 3720103f41 | |||
|  | c4d27ee30f | ||
|  | d26a5dce12 | ||
|  | 5843a943d9 | ||
|  | c1341b8ed2 | ||
| 6a4515d0cd | |||
| a0a39e4b00 | |||
| b9fb16077c | |||
| 4b3c566c89 | |||
|  | cbd2dfe53f | ||
|  | 6cdb1eb62c | ||
| ed7175076b | |||
| 27677b3870 | |||
| 7423f5af1a | |||
|  | 21d6dbe0b6 | ||
|  | 1ee84509b5 | ||
| 57e57d162f | |||
| 5b0870bb19 | |||
| 7f5354630a | |||
| 008ac6b5ae | |||
| c7aa4e0c1f | |||
| 43bd918a47 | |||
| 7eda54bb87 | |||
| bd75b843fa | |||
| 8865bf5d7c | |||
|  | caabbcd951 | ||
| 48528c5b1d | |||
| f7b90a0c14 | |||
|  | a9848becb0 | ||
| 7cc13f48d5 | |||
| b6b267fd4b | |||
| 9671a61bb2 | |||
| d7dc617746 | |||
| 32cb2e1a9a | |||
| 3d31113337 | |||
| 48b6f7e6ad | |||
| 0da411fe60 | |||
| d7b9ed199d | |||
| 7e74f7bec4 | |||
| dae7b30b92 | |||
| f7e4661ca0 | |||
|  | 7b66197534 | ||
|  | c3273eff20 | ||
|  | 67a3d7aeed | ||
|  | d8831fe925 | ||
| c7ceff6a21 | |||
|  | 5580b3a7d1 | ||
| 33d8fb2dd9 | |||
| 9f6f776460 | |||
|  | 84fe36d084 | ||
|  | 3438dde8df | ||
|  | aea49bc349 | ||
|  | 9ef6f9878e | ||
| 708ca8585a | |||
| d15bf4b8e1 | |||
|  | 7496da0987 | ||
|  | 2568f5b925 | ||
| 577cdf1d72 | |||
| f92ed659a7 | |||
| dfb7fb1d9f | |||
| a4c1ab6147 | |||
| cf85f0388d | |||
| 00b0f75b0d | |||
| b45586e81c | |||
| 2c7e6bf58b | |||
| 7c5a06f6d0 | |||
|  | 068ef85b05 | ||
|  | a6ab742fdb | ||
| 2062a8d578 | |||
| 3d3e8f4f9f | |||
|  | 2756f16a5e | ||
|  | d7908c33de | ||
|  | 4cc2ebc9e4 | ||
|  | b8afa7314c | ||
|  | be5605931c | ||
| 09fa821510 | |||
|  | f45d2d5dcc | ||
|  | 0a82fae45c | ||
|  | 46b05aa9c5 | ||
|  | 813c1ab1f1 | ||
|  | b1c27a141d | ||
| 81bb361299 | |||
|  | 79d533550d | ||
|  | b8c106f320 | ||
| b74492a805 | |||
| c93a43f158 | |||
|  | 0ff410ae19 | ||
|  | ced30b61e2 | ||
|  | 2b782df290 | ||
|  | f0f1ba0307 | ||
| 2343e621e6 | |||
|  | 2568504821 | ||
| b821dde020 | |||
| ae3b053334 | |||
|  | c5e081d69c | ||
|  | 535a6aaf05 | ||
|  | e73b909a48 | ||
|  | a4d9200293 | ||
|  | 350508bdb3 | ||
|  | 38852737e4 | ||
|  | 802404c78c | ||
|  | 0e9b591c1c | ||
|  | c43a2b599a | ||
|  | 8c91e82ee8 | ||
|  | 9d866d062a | ||
|  | 3a4e397e72 | ||
|  | 2b6cfe555f | ||
|  | 7df58dd883 | ||
|  | 4bf86ae60a | ||
|  | 07ee87ff5a | ||
|  | 0c2498fe2f | ||
|  | ad2e65dad5 | ||
|  | 715babeac8 | ||
|  | 3eae9a9e3f | ||
|  | 186aad065f | ||
|  | bf5685eb11 | ||
|  | 4a96c067ae | ||
|  | ab063f33c0 | ||
|  | 9efcc535bc | ||
|  | 231b61d012 | ||
|  | e898f4f0b0 | ||
|  | d5db5f5242 | ||
|  | 2fcedb13dd | ||
|  | 35ed1defac | ||
|  | 4e95accf80 | ||
|  | 422764757d | ||
|  | afc462bd58 | ||
|  | b57a4d32aa | ||
|  | adbdc4e65b | ||
|  | e4deea4b94 | ||
|  | 94d721a20b | ||
|  | 7bf82f5b37 | ||
|  | f02c7ea534 | ||
|  | bc503b60e6 | ||
|  | 704ca162c1 | ||
|  | b5329d8852 | ||
|  | f27b9347ff | ||
|  | b4967f0231 | ||
|  | 6d0f1aabb1 | ||
|  | f4bfeb835d | ||
|  | 394b7b6276 | ||
|  | da17a015c7 | ||
|  | 1fd08c21ac | ||
|  | 28db0631ff | ||
|  | b35401b86b | ||
|  | a0714de8ec | ||
|  | 21a1710b43 | ||
|  | b2b5137d28 | ||
|  | 2cc07450f4 | ||
|  | c0e8bc9da9 | ||
|  | b1265ae867 | ||
|  | 32bb85ea4c | ||
|  | ca0607b6ef | ||
|  | 19b527e83f | ||
|  | 4730d4692a | ||
|  | 1bb456c0c5 | ||
|  | 4b04ae3611 | ||
|  | 2f776d51c6 | ||
|  | 3a50afe7e7 | ||
|  | f8e880b445 | ||
|  | 3e947527cb | ||
|  | 31f65beac8 | ||
|  | 38e2a32ac9 | ||
|  | efa84ca50a | ||
|  | 5e96d6d04c | ||
|  | df30bdc599 | ||
|  | 7f45222924 | ||
|  | dd891f5e3b | ||
|  | 6c97a6a071 | ||
|  | 73bb2d5128 | ||
|  | b710fec6ea | ||
|  | b2a8cd60f5 | ||
|  | 867ee364ab | ||
|  | 25becc9324 | ||
|  | 94d1ae4c82 | ||
|  | 2075b177ef | ||
|  | 847c761ccc | ||
|  | 8287ed8383 | ||
|  | e6be7416f4 | ||
|  | 26863b6d95 | ||
|  | ebd730bd54 | ||
|  | 066be31a3b | ||
|  | 7a4c142955 | ||
|  | eb7d34a4cc | ||
|  | aab27a655a | ||
|  | 93280bae85 | ||
|  | c5f93abcd7 | ||
|  | d5deef782d | ||
|  | 5f50473c0d | ||
|  | 13f50406e3 | ||
|  | 09cd46d337 | ||
|  | d3f51065c2 | ||
|  | 925ac4173d | ||
|  | eb921041d0 | ||
|  | 87c5c0271b | ||
|  | a3f5a13591 | ||
|  | 9fe28f00eb | ||
|  | a8a0bb85cc | ||
|  | 6411caad67 | ||
|  | 7533035a99 | ||
|  | b15db11c60 | ||
|  | f6077f9d48 | ||
|  | 572954ef12 | ||
|  | cedeaae7db | ||
|  | e6cf0b1e17 | ||
|  | 5412628ea6 | ||
|  | 1f70cedbab | ||
|  | b50f37cfb4 | ||
|  | cb0d2a1b03 | ||
|  | 6fe9b28a82 | ||
|  | b002587d7c | ||
|  | 6c08385782 | ||
|  | 4e1272fabf | ||
|  | 607dc2d3c6 | ||
|  | 23c880b009 | ||
|  | 334bb6792f | ||
|  | a3690071b4 | ||
|  | 299d119013 | ||
|  | 55be842d23 | ||
|  | 9875c446c6 | ||
|  | 9c25eb35ca | ||
|  | 5ac96dbdc6 | ||
|  | 5cc9aca85d | ||
|  | ac29ebcb95 | ||
|  | a5cfb89304 | ||
|  | f04a7251cc | ||
|  | d4ce7d9905 | ||
|  | 8a1d303ab9 | ||
|  | bf0a4de919 | ||
|  | 6fe5885fe4 | ||
|  | 17ac309e84 | ||
|  | 7467a1c027 | ||
|  | fdfb8a26a8 | ||
|  | 2df4e422ad | ||
|  | 3a3e3cac40 | ||
|  | b1c02ec310 | ||
|  | 38eadee2c9 | ||
|  | 42c70437be | ||
|  | 65274b4d7f | ||
|  | 7e8be32755 | ||
|  | ff761ea4e6 | ||
|  | a31d3e60d8 | ||
|  | 4d60b92b7f | ||
|  | c159c70c84 | ||
|  | 28b5572755 | ||
|  | 5fac7080bc | ||
|  | 4548523ecc | ||
|  | 4154fc6f44 | ||
|  | 4e3458516a | ||
|  | 90a2efb9b3 | ||
|  | 40699221e2 | ||
|  | 3cb1b545d0 | ||
|  | e199ba7e88 | ||
|  | 4d53703c67 | ||
|  | d506c59efa | ||
|  | 44188a5c6f | ||
|  | 2018077770 | ||
|  | 984e06e2b5 | ||
|  | aead94e9a7 | ||
|  | 3277bda130 | ||
|  | 442b0b406c | ||
|  | 8824a54269 | ||
|  | c03423250f | ||
|  | 317fd0da44 | ||
|  | 783795a44a | ||
|  | 0e6197fbed | ||
|  | dad7862f91 | ||
|  | c89a883448 | ||
|  | c204288fbc | ||
|  | ad739f042a | ||
|  | db988301d0 | ||
|  | 9b1f29c4c2 | ||
|  | e5ea04ee0c | ||
|  | c92a3c6068 | ||
|  | 03f8da8fbc | ||
|  | 78a9e31ff0 | ||
|  | c1fc947bb8 | ||
|  | ff7b19a71b | ||
|  | 1c16ffa1c1 | ||
|  | 4962f59477 | ||
|  | e158b60bce | ||
|  | 34820bec27 | ||
|  | eed9aa9f0c | ||
|  | 8792ff6439 | ||
|  | 078901278c | ||
|  | bf5fb89aff | ||
|  | 7574c18cef | ||
|  | 36ea5f6b77 | ||
|  | 285deab432 | ||
|  | bb7d87d0a0 | ||
|  | b9b5bdfc3a | ||
|  | 51eb2c5dfc | ||
|  | ede0dff794 | ||
|  | aa6de818e2 | ||
|  | dcf6517a93 | ||
|  | a308dff410 | ||
|  | 14ba20898a | ||
|  | a53d3ee19a | ||
|  | 5df435319d | ||
|  | 0da2d3e222 | ||
|  | 9c9dfbfa78 | ||
|  | e4df025d01 | ||
|  | cfeda9d536 | ||
|  | 4450b1993a | ||
|  | d03ce5c2a4 | ||
|  | 7d6522c1ef | ||
|  | b96832a922 | ||
|  | 5d7af47b05 | ||
|  | 053ef25c90 | ||
|  | 8ae77d3706 | ||
|  | 79b50feacf | ||
|  | c67c1544cd | ||
|  | e657f9a344 | ||
|  | b6ebf35af5 | ||
|  | 604c05f4b8 | ||
|  | 70e276e1ab | ||
|  | 9472b02771 | ||
|  | 9597ab94eb | ||
|  | ce4da83bc2 | ||
|  | d557f3ef77 | ||
|  | f574c20118 | ||
|  | f102897385 | ||
|  | d6fce3e498 | ||
|  | 2d0bcc2606 | ||
|  | 45df59720e | ||
|  | 44ef5bc207 | ||
|  | 98af36217a | ||
|  | be7b37b9c9 | ||
|  | c4f82e072b | ||
|  | 3f9654e397 | ||
|  | 912b50f6fa | ||
|  | 2a4a0e43c1 | ||
|  | 32523a229c | ||
|  | 1ebd56c3fb | ||
|  | 8dccffdfd5 | ||
|  | 5642ea270f | ||
|  | 43cea62855 | ||
|  | 2b4067bb71 | ||
|  | 85771e97e9 | ||
|  | 8b371ffa94 | ||
|  | bf659dfd92 | ||
|  | 76a4dd36d9 | ||
|  | f4010023ca | ||
|  | 24a4589def | ||
|  | c904822e74 | ||
|  | 40ee1e1957 | ||
|  | 461df78a3f | ||
|  | db9c9475d4 | ||
|  | 214f7a6f13 | ||
|  | c844cfcda8 | ||
|  | a3e3034e6f | ||
|  | e7cba358c2 | ||
|  | 99329197ee | ||
|  | 421401af55 | ||
|  | 0626c1e39e | ||
|  | 725f03e2e2 | ||
|  | 65f77112e0 | ||
|  | 408b868475 | ||
|  | 1c797deb04 | ||
|  | b9d5a42b57 | ||
|  | e737591918 | ||
|  | ba5ea5830b | ||
|  | 43f244badf | ||
|  | e9c8ba5ef7 | ||
|  | d70709a8e8 | ||
|  | 733f8ff0b2 | ||
|  | 0bfa5bb213 | ||
|  | 1f26a234f9 | ||
|  | 13f0116425 | ||
|  | 25f589b064 | ||
|  | 210c50a278 | ||
|  | 549a143e78 | ||
|  | 277301486d | ||
|  | c851b39a49 | ||
|  | 15cc12eb6c | ||
|  | ae4f1f8c12 | ||
|  | 5609624b44 | ||
|  | b5a947dd79 | ||
|  | ee16f62322 | ||
|  | 3318de27d6 | ||
|  | ac56965306 | ||
|  | 8e99264f40 | ||
|  | 69327db9a9 | ||
|  | 7331ee2d80 | ||
|  | 918c105c57 | ||
|  | be1511d469 | ||
|  | f1c31df9d2 | ||
|  | ff7b587fad | ||
|  | 4e1135b214 | ||
|  | acd4955a18 | ||
|  | bd08dc4f45 | ||
|  | 22d137d4e5 | ||
|  | 87ee592176 | ||
|  | 063603b1ea | ||
|  | f292106db6 | ||
|  | 9d08aebea9 | ||
|  | 4e30739093 | ||
|  | 90ea472411 | ||
|  | 56999474e2 | ||
|  | d74c21a386 | ||
|  | ca6bdd7302 | ||
|  | 6f20f1d224 | ||
|  | d0e357ef89 | ||
|  | 21251f2e1b | ||
|  | fcf1ccf669 | ||
|  | 49cce514f1 | ||
|  | 695af98a1d | ||
|  | f8cb46d360 | ||
|  | 0da64dea90 | ||
|  | 2cceebbf12 | ||
|  | 40232dcefe | ||
|  | dbd86bb95b | ||
|  | b8fd2c161f | ||
|  | df9b979583 | ||
|  | 23ef0e3e19 | ||
|  | ae9175735a | ||
|  | 2d13ea1a22 | ||
|  | 8c675064bd | ||
|  | 550b905bb8 | ||
|  | edb79dc088 | ||
|  | 88e635c5d1 | ||
|  | ecb4a24de8 | ||
|  | c8c1d36710 | ||
|  | b4bb428d9b | ||
|  | e9ef7e3852 | ||
|  | 31cbbfc07e | ||
|  | 4eb0552d1d | ||
|  | 08f2a4564f | ||
|  | 7e00f643f8 | ||
|  | c19ccdad7c | ||
|  | 8aed4181e1 | ||
|  | 06ab7f5661 | ||
|  | 645ec8eba0 | ||
|  | 72ffa8a88e | ||
|  | 4c829b410e | ||
|  | eda4fd9912 | ||
|  | 041d9137c0 | ||
|  | eeacdfe031 | ||
|  | e5535f4d72 | ||
|  | 044a292281 | ||
|  | fe0467df1e | ||
|  | 19234fb40e | ||
|  | f445257d28 | ||
|  | bdc2a987aa | ||
|  | 72acb0e48f | ||
|  | b4e9211df7 | ||
|  | 97019d2997 | ||
|  | 83c5f05094 | ||
|  | 1619e42d90 | ||
|  | 9f6cebe5ff | ||
|  | a84ebe5624 | ||
|  | c527e39881 | ||
|  | a0f4687887 | ||
|  | 3ef7b2389e | ||
|  | 7dfa3d0b50 | ||
|  | bf629dddce | ||
|  | 7747b95430 | ||
|  | ccd75c039a | ||
|  | 493ea80208 | ||
|  | 229baf3aba | ||
|  | 0ce4ecfc84 | ||
|  | ddfaae8ea6 | ||
|  | 70c5b781e5 | ||
|  | 901e359d28 | ||
|  | e857d4d4c8 | ||
|  | e5b77c7fd8 | ||
|  | 3b5d629048 | ||
|  | 08772d5e0c | ||
|  | 017dcd69a6 | ||
|  | 8178a17b88 | ||
|  | c5c1b53e54 | ||
|  | 440f9e2013 | ||
|  | c98657d588 | ||
|  | f450857716 | ||
|  | 9ec238df9e | ||
|  | 3ba8eb1500 | ||
|  | 8da49c5a34 | ||
|  | e04f61b1fa | ||
|  | 115e13b227 | ||
|  | 75f3062a80 | ||
|  | b460cd3ef1 | ||
|  | 0e6727a33b | ||
|  | 4c6745cb4c | ||
|  | efdd0e572c | ||
|  | ca60a218ac | ||
|  | 03633d709e | ||
|  | 4de58c4aab | ||
|  | 4f8b1c1940 | ||
|  | dec39b313d | ||
|  | dc835ad1cb | ||
|  | 71c8c9e4fb | ||
|  | a935ef7b39 | ||
|  | a97ad1a51d | ||
|  | 5ab9129db3 | ||
|  | 634943c11f | ||
|  | e598e65f69 | ||
|  | 291407dc7f | ||
|  | 641a28aa1d | ||
|  | 75207fa010 | ||
|  | c2b0e0269a | ||
|  | 7828887604 | ||
|  | e6efc93a7c | ||
|  | ff7e773d5e | ||
|  | a0380fad72 | ||
|  | 61e9a33777 | ||
|  | 3e139b52d3 | ||
|  | fd6031b005 | ||
|  | fe44fc50d9 | ||
|  | 2dd88cf3f8 | ||
|  | 6b7e82f1a9 | ||
|  | be612b3931 | ||
|  | f5e74033f9 | ||
|  | 8d52e0a349 | ||
|  | a60f6d353e | ||
|  | 5d3b574325 | ||
|  | 6ee5ea6b32 | ||
|  | cc349c6512 | ||
|  | fde2e07bf4 | ||
|  | 2f38fe8d45 | ||
|  | 813af84ae8 | ||
|  | cfe6c6838f | ||
|  | 12a7216dfe | ||
|  | 71ebd61327 | ||
|  | 2c2da60cc2 | ||
|  | 7631ed9c56 | ||
|  | 65669b116e | ||
|  | ae2a6cfc6e | ||
|  | c36223055e | ||
|  | e42de105c5 | ||
|  | b08dae0809 | ||
|  | 3bf8fddbb5 | ||
|  | d29fa23ebc | ||
|  | c978c88521 | ||
|  | 93f09818da | ||
|  | 54a8ea93ec | ||
|  | 56e87d6e55 | ||
|  | df29cc19ab | ||
|  | e61189db3f | ||
|  | 361ce948c3 | ||
|  | 049b4a4631 | ||
|  | 9f2f294a27 | ||
|  | 81dcd0e6ea | ||
|  | 34a788331f | ||
|  | e2c39945b3 | ||
|  | 1591d391b9 | ||
|  | f4c06ed8c0 | ||
|  | 1f49f781bf | ||
|  | 3a9f746421 | ||
|  | 4491d87766 | ||
|  | 0e080a7abc | ||
|  | 8bf78846ee | ||
|  | 9aa34dc803 | ||
|  | fdcbe0a0d1 | ||
|  | 6a62a9c6a5 | ||
|  | b331ecea78 | ||
|  | 66f8a2f082 | ||
|  | d58b7cf9b9 | ||
|  | 0d749becff | ||
|  | 1dbea9aa69 | ||
|  | c1438cbbe3 | ||
|  | f4623fd551 | ||
|  | 59ba9ff3bb | ||
|  | 1fbab4032b | ||
|  | c037244874 | ||
|  | f4272aa6fd | ||
|  | 8cb7a1a887 | ||
|  | b45bd8e097 | ||
|  | 5e48b701ec | ||
|  | 7f6bffe5ad | ||
|  | 6bf5fb1924 | ||
|  | 086db7bd19 | ||
|  | c0a9b38c02 | ||
|  | 6d7bdfb5f5 | ||
|  | be5d70ae6e | ||
|  | ab1068044e | ||
|  | dda151250f | ||
|  | 18daf85069 | ||
|  | 81cc28f6ca | ||
|  | c01a1e02fe | ||
|  | 7e70f4ed9c | ||
|  | 1056e36f11 | ||
|  | 0b8a88978b | ||
|  | 59b31b6bb8 | ||
|  | 69496482fc | ||
|  | 4be31ad1f6 | ||
|  | 176a021ce9 | ||
|  | b673174b71 | ||
|  | e6f7a5a818 | ||
|  | 68b69a2ac0 | ||
|  | bd15c38ae8 | ||
|  | b815f5f764 | ||
|  | 4da437431e | ||
|  | 3c7bf211a9 | ||
|  | 347d5404dd | ||
|  | 5e2cd0d07c | ||
|  | 62fcee72c5 | ||
|  | 0a6168eef0 | ||
|  | 63865e4232 | ||
|  | c64deedf74 | ||
|  | 3281559ec3 | ||
|  | 6a2eca2ec2 | ||
|  | d8ff895e74 | ||
|  | 00c49d4c17 | ||
|  | ec89714cce | ||
|  | 6ab744c720 | ||
|  | bbb657da5c | ||
|  | fbc2380cb8 | ||
|  | 08682c5461 | ||
|  | 13bce2a6bf | ||
|  | 70e689900b | 
| @@ -30,8 +30,34 @@ directory | ||||
| #ifndef DISABLE_WARNINGS_H | ||||
| #define DISABLE_WARNINGS_H | ||||
|  | ||||
|  | ||||
|  | ||||
| #if defined __GNUC__ && __GNUC__>=6 | ||||
| #pragma GCC diagnostic ignored "-Wignored-attributes" | ||||
| #endif | ||||
|  | ||||
|  //disables an Intel compiler specific warning (in json.hpp) | ||||
| #pragma warning disable 488   | ||||
|  | ||||
| #ifdef __NVCC__ | ||||
|  //disables nvcc specific warning in json.hpp | ||||
| #pragma clang diagnostic ignored "-Wdeprecated-register" | ||||
| #pragma diag_suppress unsigned_compare_with_zero | ||||
| #pragma diag_suppress cast_to_qualified_type | ||||
|  | ||||
|  //disables nvcc specific warning in many files | ||||
| #pragma diag_suppress esa_on_defaulted_function_ignored | ||||
| #pragma diag_suppress extra_semicolon | ||||
|  | ||||
| //Eigen only | ||||
| #endif | ||||
|  | ||||
| // Disable vectorisation in Eigen on the Power8/9 and PowerPC | ||||
| #ifdef  __ALTIVEC__ | ||||
| #define  EIGEN_DONT_VECTORIZE | ||||
| #endif | ||||
| #ifdef  __VSX__ | ||||
| #define  EIGEN_DONT_VECTORIZE | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -38,16 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #ifndef GRID_BASE_H | ||||
| #define GRID_BASE_H | ||||
|  | ||||
| #include <Grid/GridStd.h> | ||||
|  | ||||
| #include <Grid/DisableWarnings.h> | ||||
| #include <Grid/Namespace.h> | ||||
| #include <Grid/GridStd.h> | ||||
| #include <Grid/threads/Pragmas.h> | ||||
| #include <Grid/perfmon/Timer.h> | ||||
| #include <Grid/perfmon/PerfCount.h> | ||||
| #include <Grid/log/Log.h> | ||||
| #include <Grid/allocator/AlignedAllocator.h> | ||||
| #include <Grid/simd/Simd.h> | ||||
| #include <Grid/serialisation/Serialisation.h> | ||||
| #include <Grid/threads/Threads.h> | ||||
| #include <Grid/util/Util.h> | ||||
| #include <Grid/log/Log.h> | ||||
| #include <Grid/allocator/Allocator.h> | ||||
| #include <Grid/simd/Simd.h> | ||||
| #include <Grid/threads/ThreadReduction.h> | ||||
| #include <Grid/serialisation/Serialisation.h> | ||||
| #include <Grid/util/Sha.h> | ||||
| #include <Grid/communicator/Communicator.h>  | ||||
| #include <Grid/cartesian/Cartesian.h>     | ||||
| @@ -57,5 +60,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/stencil/Stencil.h>       | ||||
| #include <Grid/parallelIO/BinaryIO.h> | ||||
| #include <Grid/algorithms/Algorithms.h>    | ||||
| NAMESPACE_CHECK(GridCore) | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/qcd/spin/Spin.h> | ||||
| #include <Grid/qcd/utils/Utils.h> | ||||
| #include <Grid/qcd/representations/Representations.h> | ||||
| NAMESPACE_CHECK(GridQCDCore); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -6,7 +6,9 @@ | ||||
| /////////////////// | ||||
| #include <cassert> | ||||
| #include <complex> | ||||
| #include <memory> | ||||
| #include <vector> | ||||
| #include <array> | ||||
| #include <string> | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
|   | ||||
| @@ -1,14 +1,59 @@ | ||||
| #include <Grid/GridCore.h> | ||||
| #pragma once | ||||
| // Force Eigen to use MKL if Grid has been configured with --enable-mkl | ||||
| #ifdef USE_MKL | ||||
| #define EIGEN_USE_MKL_ALL | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #if defined __GNUC__ | ||||
| #pragma GCC diagnostic push | ||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | ||||
| #endif | ||||
|  | ||||
| /* NVCC save and restore compile environment*/ | ||||
| #ifdef __NVCC__ | ||||
| #pragma push | ||||
| #pragma diag_suppress code_is_unreachable | ||||
| #pragma push_macro("__CUDA_ARCH__") | ||||
| #pragma push_macro("__NVCC__") | ||||
| #pragma push_macro("__CUDACC__") | ||||
| #undef __CUDA_ARCH__ | ||||
| #undef __NVCC__ | ||||
| #undef __CUDACC__ | ||||
| #define __NVCC__REDEFINE__ | ||||
| #endif  | ||||
|  | ||||
| /* SYCL save and restore compile environment*/ | ||||
| #ifdef GRID_SYCL | ||||
| #pragma push | ||||
| #pragma push_macro("__SYCL_DEVICE_ONLY__") | ||||
| #undef __SYCL_DEVICE_ONLY__ | ||||
| #define EIGEN_DONT_VECTORIZE | ||||
| //#undef EIGEN_USE_SYCL | ||||
| #define __SYCL__REDEFINE__ | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #include <Grid/Eigen/Dense> | ||||
| #include <Grid/Eigen/unsupported/CXX11/Tensor> | ||||
|  | ||||
| /* NVCC restore */ | ||||
| #ifdef __NVCC__REDEFINE__ | ||||
| #pragma pop_macro("__CUDACC__") | ||||
| #pragma pop_macro("__NVCC__") | ||||
| #pragma pop_macro("GRID_SIMT") | ||||
| #pragma pop | ||||
| #endif | ||||
|  | ||||
| /*SYCL restore*/ | ||||
| #ifdef __SYCL__REDEFINE__ | ||||
| #pragma pop_macro("__SYCL_DEVICE_ONLY__") | ||||
| #pragma pop | ||||
| #endif | ||||
|  | ||||
| #if defined __GNUC__ | ||||
| #pragma GCC diagnostic pop | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										1
									
								
								Grid/Grid_Eigen_Tensor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								Grid/Grid_Eigen_Tensor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| #include <Grid/Grid_Eigen_Dense.h> | ||||
| @@ -21,7 +21,7 @@ if BUILD_HDF5 | ||||
|   extra_headers+=serialisation/Hdf5Type.h | ||||
| endif | ||||
|  | ||||
| all: version-cache | ||||
| all: version-cache Version.h | ||||
|  | ||||
| version-cache: | ||||
| 	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ | ||||
| @@ -42,7 +42,7 @@ version-cache: | ||||
| 	fi;\ | ||||
| 	rm -f vertmp | ||||
|  | ||||
| Version.h: | ||||
| Version.h: version-cache | ||||
| 	cp version-cache Version.h | ||||
|  | ||||
| .PHONY: version-cache | ||||
|   | ||||
| @@ -1,12 +1,11 @@ | ||||
| /*************************************************************************************
 | ||||
| 
 | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
| 
 | ||||
| Source file: Hadrons/Modules/MNPR/FourQuark.cc | ||||
| Source file: ./lib/Namespace.h | ||||
| 
 | ||||
| Copyright (C) 2015-2019 | ||||
| Copyright (C) 2016 | ||||
| 
 | ||||
| Author: Antonin Portelli <antonin.portelli@me.com> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| 
 | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| @@ -23,14 +22,17 @@ You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 
 | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Hadrons/Modules/MNPR/FourQuark.hpp> | ||||
| #pragma once | ||||
| 
 | ||||
| using namespace Grid; | ||||
| using namespace Hadrons; | ||||
| using namespace MNPR; | ||||
| 
 | ||||
| template class Grid::Hadrons::MNPR::TFourQuark<FIMPL,FIMPL>; | ||||
| #include <type_traits> | ||||
| #include <cassert> | ||||
| 
 | ||||
| #define NAMESPACE_BEGIN(A) namespace A { | ||||
| #define NAMESPACE_END(A)   } | ||||
| #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) | ||||
| #define GRID_NAMESPACE_END   NAMESPACE_END(Grid) | ||||
| #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  );  | ||||
| @@ -29,23 +29,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #ifndef GRID_ALGORITHMS_H | ||||
| #define GRID_ALGORITHMS_H | ||||
|  | ||||
| NAMESPACE_CHECK(algorithms); | ||||
| #include <Grid/algorithms/SparseMatrix.h> | ||||
| #include <Grid/algorithms/LinearOperator.h> | ||||
| #include <Grid/algorithms/Preconditioner.h> | ||||
| NAMESPACE_CHECK(SparseMatrix); | ||||
|  | ||||
| #include <Grid/algorithms/approx/Zolotarev.h> | ||||
| #include <Grid/algorithms/approx/Chebyshev.h> | ||||
| #include <Grid/algorithms/approx/JacobiPolynomial.h> | ||||
| #include <Grid/algorithms/approx/Remez.h> | ||||
| #include <Grid/algorithms/approx/MultiShiftFunction.h> | ||||
| #include <Grid/algorithms/approx/Forecast.h> | ||||
|  | ||||
| #include <Grid/algorithms/approx/RemezGeneral.h> | ||||
| #include <Grid/algorithms/approx/ZMobius.h> | ||||
| NAMESPACE_CHECK(approx); | ||||
| #include <Grid/algorithms/iterative/Deflation.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradient.h> | ||||
| NAMESPACE_CHECK(ConjGrad); | ||||
| #include <Grid/algorithms/iterative/BiCGSTAB.h> | ||||
| NAMESPACE_CHECK(BiCGSTAB); | ||||
| #include <Grid/algorithms/iterative/ConjugateResidual.h> | ||||
| #include <Grid/algorithms/iterative/NormalEquations.h> | ||||
| #include <Grid/algorithms/iterative/SchurRedBlack.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | ||||
| #include <Grid/algorithms/iterative/MinimalResidual.h> | ||||
| @@ -57,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> | ||||
| #include <Grid/algorithms/iterative/PowerMethod.h> | ||||
|  | ||||
| NAMESPACE_CHECK(PowerMethod); | ||||
| #include <Grid/algorithms/CoarsenedMatrix.h> | ||||
| NAMESPACE_CHECK(CoarsendMatrix); | ||||
| #include <Grid/algorithms/FFT.h> | ||||
|  | ||||
| #endif | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,5 +1,4 @@ | ||||
|  | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef _GRID_FFT_H_ | ||||
| #define _GRID_FFT_H_ | ||||
|  | ||||
| @@ -37,65 +36,64 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #endif | ||||
| #endif | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|   template<class scalar> struct FFTW { }; | ||||
| template<class scalar> struct FFTW { }; | ||||
|  | ||||
| #ifdef HAVE_FFTW	 | ||||
|   template<> struct FFTW<ComplexD> { | ||||
|   public: | ||||
| template<> struct FFTW<ComplexD> { | ||||
| public: | ||||
|  | ||||
|     typedef fftw_complex FFTW_scalar; | ||||
|     typedef fftw_plan    FFTW_plan; | ||||
|   typedef fftw_complex FFTW_scalar; | ||||
|   typedef fftw_plan    FFTW_plan; | ||||
|  | ||||
|     static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, | ||||
| 					FFTW_scalar *in, const int *inembed,		 | ||||
| 					int istride, int idist,		 | ||||
| 					FFTW_scalar *out, const int *onembed,		 | ||||
| 					int ostride, int odist,		 | ||||
| 					int sign, unsigned flags) { | ||||
|       return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); | ||||
|     }	   | ||||
|   static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, | ||||
| 				      FFTW_scalar *in, const int *inembed,		 | ||||
| 				      int istride, int idist,		 | ||||
| 				      FFTW_scalar *out, const int *onembed,		 | ||||
| 				      int ostride, int odist,		 | ||||
| 				      int sign, unsigned flags) { | ||||
|     return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); | ||||
|   }	   | ||||
|      | ||||
|     static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ | ||||
|       ::fftw_flops(p,add,mul,fmas); | ||||
|     } | ||||
|   static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ | ||||
|     ::fftw_flops(p,add,mul,fmas); | ||||
|   } | ||||
|  | ||||
|     inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { | ||||
|       ::fftw_execute_dft(p,in,out); | ||||
|     } | ||||
|     inline static void fftw_destroy_plan(const FFTW_plan p) { | ||||
|       ::fftw_destroy_plan(p); | ||||
|     } | ||||
|   }; | ||||
|   inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { | ||||
|     ::fftw_execute_dft(p,in,out); | ||||
|   } | ||||
|   inline static void fftw_destroy_plan(const FFTW_plan p) { | ||||
|     ::fftw_destroy_plan(p); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|   template<> struct FFTW<ComplexF> { | ||||
|   public: | ||||
| template<> struct FFTW<ComplexF> { | ||||
| public: | ||||
|  | ||||
|     typedef fftwf_complex FFTW_scalar; | ||||
|     typedef fftwf_plan    FFTW_plan; | ||||
|   typedef fftwf_complex FFTW_scalar; | ||||
|   typedef fftwf_plan    FFTW_plan; | ||||
|  | ||||
|     static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, | ||||
| 					FFTW_scalar *in, const int *inembed,		 | ||||
| 					int istride, int idist,		 | ||||
| 					FFTW_scalar *out, const int *onembed,		 | ||||
| 					int ostride, int odist,		 | ||||
| 					int sign, unsigned flags) { | ||||
|       return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); | ||||
|     }	   | ||||
|   static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, | ||||
| 				      FFTW_scalar *in, const int *inembed,		 | ||||
| 				      int istride, int idist,		 | ||||
| 				      FFTW_scalar *out, const int *onembed,		 | ||||
| 				      int ostride, int odist,		 | ||||
| 				      int sign, unsigned flags) { | ||||
|     return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); | ||||
|   }	   | ||||
|      | ||||
|     static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ | ||||
|       ::fftwf_flops(p,add,mul,fmas); | ||||
|     } | ||||
|   static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ | ||||
|     ::fftwf_flops(p,add,mul,fmas); | ||||
|   } | ||||
|  | ||||
|     inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { | ||||
|       ::fftwf_execute_dft(p,in,out); | ||||
|     } | ||||
|     inline static void fftw_destroy_plan(const FFTW_plan p) { | ||||
|       ::fftwf_destroy_plan(p); | ||||
|     } | ||||
|   }; | ||||
|   inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { | ||||
|     ::fftwf_execute_dft(p,in,out); | ||||
|   } | ||||
|   inline static void fftw_destroy_plan(const FFTW_plan p) { | ||||
|     ::fftwf_destroy_plan(p); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| @@ -104,203 +102,195 @@ namespace Grid { | ||||
| #define FFTW_BACKWARD (+1) | ||||
| #endif | ||||
|  | ||||
|   class FFT { | ||||
|   private: | ||||
| class FFT { | ||||
| private: | ||||
|      | ||||
|     GridCartesian *vgrid; | ||||
|     GridCartesian *sgrid; | ||||
|   GridCartesian *vgrid; | ||||
|   GridCartesian *sgrid; | ||||
|      | ||||
|     int Nd; | ||||
|     double flops; | ||||
|     double flops_call; | ||||
|     uint64_t usec; | ||||
|   int Nd; | ||||
|   double flops; | ||||
|   double flops_call; | ||||
|   uint64_t usec; | ||||
|      | ||||
|     std::vector<int> dimensions; | ||||
|     std::vector<int> processors; | ||||
|     std::vector<int> processor_coor; | ||||
|   Coordinate dimensions; | ||||
|   Coordinate processors; | ||||
|   Coordinate processor_coor; | ||||
|      | ||||
|   public: | ||||
| public: | ||||
|      | ||||
|     static const int forward=FFTW_FORWARD; | ||||
|     static const int backward=FFTW_BACKWARD; | ||||
|   static const int forward=FFTW_FORWARD; | ||||
|   static const int backward=FFTW_BACKWARD; | ||||
|      | ||||
|     double Flops(void) {return flops;} | ||||
|     double MFlops(void) {return flops/usec;} | ||||
|     double USec(void)   {return (double)usec;}     | ||||
|   double Flops(void) {return flops;} | ||||
|   double MFlops(void) {return flops/usec;} | ||||
|   double USec(void)   {return (double)usec;}     | ||||
|  | ||||
|     FFT ( GridCartesian * grid ) : | ||||
|   FFT ( GridCartesian * grid ) : | ||||
|     vgrid(grid), | ||||
|     Nd(grid->_ndimension), | ||||
|     dimensions(grid->_fdimensions), | ||||
|     processors(grid->_processors), | ||||
|     processor_coor(grid->_processor_coor) | ||||
|     { | ||||
|       flops=0; | ||||
|       usec =0; | ||||
|       std::vector<int> layout(Nd,1); | ||||
|       sgrid = new GridCartesian(dimensions,layout,processors); | ||||
|     }; | ||||
|      | ||||
|     ~FFT ( void)  { | ||||
|       delete sgrid; | ||||
|     } | ||||
|      | ||||
|     template<class vobj> | ||||
|     void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){ | ||||
|  | ||||
|       conformable(result._grid,vgrid); | ||||
|       conformable(source._grid,vgrid); | ||||
|       Lattice<vobj> tmp(vgrid); | ||||
|       tmp = source; | ||||
|       for(int d=0;d<Nd;d++){ | ||||
| 	if( mask[d] ) { | ||||
| 	  FFT_dim(result,tmp,d,sign); | ||||
| 	  tmp=result; | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     template<class vobj> | ||||
|     void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ | ||||
|       std::vector<int> mask(Nd,1); | ||||
|       FFT_dim_mask(result,source,mask,sign); | ||||
|     } | ||||
|  | ||||
|  | ||||
|     template<class vobj> | ||||
|     void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ | ||||
| #ifndef HAVE_FFTW | ||||
|       assert(0); | ||||
| #else | ||||
|       conformable(result._grid,vgrid); | ||||
|       conformable(source._grid,vgrid); | ||||
|  | ||||
|       int L = vgrid->_ldimensions[dim]; | ||||
|       int G = vgrid->_fdimensions[dim]; | ||||
|        | ||||
|       std::vector<int> layout(Nd,1); | ||||
|       std::vector<int> pencil_gd(vgrid->_fdimensions); | ||||
|        | ||||
|       pencil_gd[dim] = G*processors[dim]; | ||||
|        | ||||
|       // Pencil global vol LxLxGxLxL per node | ||||
|       GridCartesian pencil_g(pencil_gd,layout,processors); | ||||
|        | ||||
|       // Construct pencils | ||||
|       typedef typename vobj::scalar_object sobj; | ||||
|       typedef typename sobj::scalar_type   scalar; | ||||
|        | ||||
|       Lattice<sobj> pgbuf(&pencil_g); | ||||
|        | ||||
|  | ||||
|       typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; | ||||
|       typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan; | ||||
|        | ||||
|       int Ncomp = sizeof(sobj)/sizeof(scalar); | ||||
|       int Nlow  = 1; | ||||
|       for(int d=0;d<dim;d++){ | ||||
|         Nlow*=vgrid->_ldimensions[d]; | ||||
|       } | ||||
|        | ||||
|       int rank = 1;  /* 1d transforms */ | ||||
|       int n[] = {G}; /* 1d transforms of length G */ | ||||
|       int howmany = Ncomp; | ||||
|       int odist,idist,istride,ostride; | ||||
|       idist   = odist   = 1;          /* Distance between consecutive FT's */ | ||||
|       istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ | ||||
|       int *inembed = n, *onembed = n; | ||||
|        | ||||
|       scalar div; | ||||
| 	  if ( sign == backward ) div = 1.0/G; | ||||
| 	  else if ( sign == forward ) div = 1.0; | ||||
| 	  else assert(0); | ||||
|        | ||||
|       FFTW_plan p; | ||||
|       { | ||||
|         FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0]; | ||||
|         FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0]; | ||||
|         p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany, | ||||
|                                              in,inembed, | ||||
|                                              istride,idist, | ||||
|                                              out,onembed, | ||||
|                                              ostride, odist, | ||||
|                                              sign,FFTW_ESTIMATE); | ||||
|       } | ||||
|        | ||||
|       // Barrel shift and collect global pencil | ||||
|       std::vector<int> lcoor(Nd), gcoor(Nd); | ||||
|       result = source; | ||||
|       int pc = processor_coor[dim]; | ||||
|       for(int p=0;p<processors[dim];p++) { | ||||
|         PARALLEL_REGION | ||||
|         { | ||||
|           std::vector<int> cbuf(Nd); | ||||
|           sobj s; | ||||
|            | ||||
|           PARALLEL_FOR_LOOP_INTERN | ||||
|           for(int idx=0;idx<sgrid->lSites();idx++) { | ||||
|             sgrid->LocalIndexToLocalCoor(idx,cbuf); | ||||
|             peekLocalSite(s,result,cbuf); | ||||
| 	    cbuf[dim]+=((pc+p) % processors[dim])*L; | ||||
| 	    //            cbuf[dim]+=p*L; | ||||
|             pokeLocalSite(s,pgbuf,cbuf); | ||||
|           } | ||||
|         } | ||||
|         if (p != processors[dim] - 1) | ||||
|         { | ||||
|           result = Cshift(result,dim,L); | ||||
|         } | ||||
|       } | ||||
|        | ||||
|       // Loop over orthog coords | ||||
|       int NN=pencil_g.lSites(); | ||||
|       GridStopWatch timer; | ||||
|       timer.Start(); | ||||
|       PARALLEL_REGION | ||||
|       { | ||||
|         std::vector<int> cbuf(Nd); | ||||
|          | ||||
|         PARALLEL_FOR_LOOP_INTERN | ||||
|         for(int idx=0;idx<NN;idx++) { | ||||
|           pencil_g.LocalIndexToLocalCoor(idx, cbuf); | ||||
|           if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0 | ||||
|             FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx]; | ||||
|             FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx]; | ||||
|             FFTW<scalar>::fftw_execute_dft(p,in,out); | ||||
|           } | ||||
|         } | ||||
|       } | ||||
|       timer.Stop(); | ||||
|        | ||||
|       // performance counting | ||||
|       double add,mul,fma; | ||||
|       FFTW<scalar>::fftw_flops(p,&add,&mul,&fma); | ||||
|       flops_call = add+mul+2.0*fma; | ||||
|       usec += timer.useconds(); | ||||
|       flops+= flops_call*NN; | ||||
|        | ||||
|       // writing out result | ||||
|       PARALLEL_REGION | ||||
|       { | ||||
|         std::vector<int> clbuf(Nd), cgbuf(Nd); | ||||
|         sobj s; | ||||
|          | ||||
|         PARALLEL_FOR_LOOP_INTERN | ||||
|         for(int idx=0;idx<sgrid->lSites();idx++) { | ||||
|           sgrid->LocalIndexToLocalCoor(idx,clbuf); | ||||
|           cgbuf = clbuf; | ||||
|           cgbuf[dim] = clbuf[dim]+L*pc; | ||||
|           peekLocalSite(s,pgbuf,cgbuf); | ||||
|           pokeLocalSite(s,result,clbuf); | ||||
|         } | ||||
|       } | ||||
|       result = result*div; | ||||
|        | ||||
|       // destroying plan | ||||
|       FFTW<scalar>::fftw_destroy_plan(p); | ||||
| #endif | ||||
|     } | ||||
|   { | ||||
|     flops=0; | ||||
|     usec =0; | ||||
|     Coordinate layout(Nd,1); | ||||
|     sgrid = new GridCartesian(dimensions,layout,processors); | ||||
|   }; | ||||
| } | ||||
|      | ||||
|   ~FFT ( void)  { | ||||
|     delete sgrid; | ||||
|   } | ||||
|      | ||||
|   template<class vobj> | ||||
|   void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){ | ||||
|  | ||||
|     conformable(result.Grid(),vgrid); | ||||
|     conformable(source.Grid(),vgrid); | ||||
|     Lattice<vobj> tmp(vgrid); | ||||
|     tmp = source; | ||||
|     for(int d=0;d<Nd;d++){ | ||||
|       if( mask[d] ) { | ||||
| 	FFT_dim(result,tmp,d,sign); | ||||
| 	tmp=result; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   template<class vobj> | ||||
|   void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ | ||||
|     Coordinate mask(Nd,1); | ||||
|     FFT_dim_mask(result,source,mask,sign); | ||||
|   } | ||||
|  | ||||
|  | ||||
|   template<class vobj> | ||||
|   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ | ||||
| #ifndef HAVE_FFTW | ||||
|     assert(0); | ||||
| #else | ||||
|     conformable(result.Grid(),vgrid); | ||||
|     conformable(source.Grid(),vgrid); | ||||
|  | ||||
|     int L = vgrid->_ldimensions[dim]; | ||||
|     int G = vgrid->_fdimensions[dim]; | ||||
|        | ||||
|     Coordinate layout(Nd,1); | ||||
|     Coordinate pencil_gd(vgrid->_fdimensions); | ||||
|        | ||||
|     pencil_gd[dim] = G*processors[dim]; | ||||
|        | ||||
|     // Pencil global vol LxLxGxLxL per node | ||||
|     GridCartesian pencil_g(pencil_gd,layout,processors); | ||||
|        | ||||
|     // Construct pencils | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename sobj::scalar_type   scalar; | ||||
|        | ||||
|     Lattice<sobj> pgbuf(&pencil_g); | ||||
|     autoView(pgbuf_v , pgbuf, CpuWrite); | ||||
|  | ||||
|     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; | ||||
|     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan; | ||||
|        | ||||
|     int Ncomp = sizeof(sobj)/sizeof(scalar); | ||||
|     int Nlow  = 1; | ||||
|     for(int d=0;d<dim;d++){ | ||||
|       Nlow*=vgrid->_ldimensions[d]; | ||||
|     } | ||||
|        | ||||
|     int rank = 1;  /* 1d transforms */ | ||||
|     int n[] = {G}; /* 1d transforms of length G */ | ||||
|     int howmany = Ncomp; | ||||
|     int odist,idist,istride,ostride; | ||||
|     idist   = odist   = 1;          /* Distance between consecutive FT's */ | ||||
|     istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ | ||||
|     int *inembed = n, *onembed = n; | ||||
|        | ||||
|     scalar div; | ||||
|     if ( sign == backward ) div = 1.0/G; | ||||
|     else if ( sign == forward ) div = 1.0; | ||||
|     else assert(0); | ||||
|        | ||||
|     FFTW_plan p; | ||||
|     { | ||||
|       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; | ||||
|       FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0]; | ||||
|       p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany, | ||||
| 					   in,inembed, | ||||
| 					   istride,idist, | ||||
| 					   out,onembed, | ||||
| 					   ostride, odist, | ||||
| 					   sign,FFTW_ESTIMATE); | ||||
|     } | ||||
|        | ||||
|     // Barrel shift and collect global pencil | ||||
|     Coordinate lcoor(Nd), gcoor(Nd); | ||||
|     result = source; | ||||
|     int pc = processor_coor[dim]; | ||||
|     for(int p=0;p<processors[dim];p++) { | ||||
|       { | ||||
| 	autoView(r_v,result,CpuRead); | ||||
| 	autoView(p_v,pgbuf,CpuWrite); | ||||
| 	thread_for(idx, sgrid->lSites(),{ | ||||
|           Coordinate cbuf(Nd); | ||||
|           sobj s; | ||||
| 	  sgrid->LocalIndexToLocalCoor(idx,cbuf); | ||||
| 	  peekLocalSite(s,r_v,cbuf); | ||||
| 	  cbuf[dim]+=((pc+p) % processors[dim])*L; | ||||
| 	  pokeLocalSite(s,p_v,cbuf); | ||||
|         }); | ||||
|       } | ||||
|       if (p != processors[dim] - 1) { | ||||
| 	result = Cshift(result,dim,L); | ||||
|       } | ||||
|     } | ||||
|        | ||||
|     // Loop over orthog coords | ||||
|     int NN=pencil_g.lSites(); | ||||
|     GridStopWatch timer; | ||||
|     timer.Start(); | ||||
|     thread_for( idx,NN,{ | ||||
|         Coordinate cbuf(Nd); | ||||
| 	pencil_g.LocalIndexToLocalCoor(idx, cbuf); | ||||
| 	if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0 | ||||
| 	  FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx]; | ||||
| 	  FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx]; | ||||
| 	  FFTW<scalar>::fftw_execute_dft(p,in,out); | ||||
| 	} | ||||
|     }); | ||||
|     timer.Stop(); | ||||
|        | ||||
|     // performance counting | ||||
|     double add,mul,fma; | ||||
|     FFTW<scalar>::fftw_flops(p,&add,&mul,&fma); | ||||
|     flops_call = add+mul+2.0*fma; | ||||
|     usec += timer.useconds(); | ||||
|     flops+= flops_call*NN; | ||||
|        | ||||
|     // writing out result | ||||
|     { | ||||
|       autoView(pgbuf_v,pgbuf,CpuRead); | ||||
|       autoView(result_v,result,CpuWrite); | ||||
|       thread_for(idx,sgrid->lSites(),{ | ||||
| 	Coordinate clbuf(Nd), cgbuf(Nd); | ||||
| 	sobj s; | ||||
| 	sgrid->LocalIndexToLocalCoor(idx,clbuf); | ||||
| 	cgbuf = clbuf; | ||||
| 	cgbuf[dim] = clbuf[dim]+L*pc; | ||||
| 	peekLocalSite(s,pgbuf_v,cgbuf); | ||||
| 	pokeLocalSite(s,result_v,clbuf); | ||||
|       }); | ||||
|     } | ||||
|     result = result*div; | ||||
|        | ||||
|     // destroying plan | ||||
|     FFTW<scalar>::fftw_destroy_plan(p); | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,24 +23,24 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_PRECONDITIONER_H | ||||
| #define GRID_PRECONDITIONER_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   template<class Field> class Preconditioner :  public LinearFunction<Field> {  | ||||
|     virtual void operator()(const Field &src, Field & psi)=0; | ||||
|   }; | ||||
| template<class Field> class Preconditioner :  public LinearFunction<Field> {  | ||||
|   virtual void operator()(const Field &src, Field & psi)=0; | ||||
| }; | ||||
|  | ||||
|   template<class Field> class TrivialPrecon :  public Preconditioner<Field> {  | ||||
|   public: | ||||
|     void operator()(const Field &src, Field & psi){ | ||||
|       psi = src; | ||||
|     } | ||||
|     TrivialPrecon(void){}; | ||||
|   }; | ||||
| // Identity preconditioner: the output is a plain copy of the input. | ||||
| template<class Field> class TrivialPrecon :  public Preconditioner<Field> {  | ||||
| public: | ||||
|   // Overwrites psi with src; no other work is done. | ||||
|   void operator()(const Field &src, Field & psi){ | ||||
|     psi = src; | ||||
|   } | ||||
|   // Nothing to configure. | ||||
|   TrivialPrecon(void){}; | ||||
| }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,57 +23,58 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H | ||||
| #define  GRID_ALGORITHM_SPARSE_MATRIX_H | ||||
|  | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Interface defining what I expect of a general sparse matrix, such as a Fermion action | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<class Field> class SparseMatrixBase { | ||||
|     public: | ||||
|       virtual GridBase *Grid(void) =0; | ||||
|       // Full checkerboard operations | ||||
|       virtual RealD M    (const Field &in, Field &out)=0; | ||||
|       virtual RealD Mdag (const Field &in, Field &out)=0; | ||||
|       virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { | ||||
| 	Field tmp (in._grid); | ||||
| 	ni=M(in,tmp); | ||||
| 	no=Mdag(tmp,out); | ||||
|       } | ||||
|       virtual  void Mdiag    (const Field &in, Field &out)=0; | ||||
|       virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0; | ||||
|     }; | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Interface defining what I expect of a general sparse matrix, such as a Fermion action | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Abstract interface for a sparse matrix acting on objects of type Field. | ||||
| // Everything except MdagM is pure virtual and must be supplied by the implementation. | ||||
| template<class Field> class SparseMatrixBase { | ||||
| public: | ||||
|   // Grid the operator is defined on. | ||||
|   virtual GridBase *Grid(void) =0; | ||||
|   // Full checkerboard operations | ||||
|   // M and Mdag read `in` and write `out`; they return nothing. | ||||
|   virtual void  M    (const Field &in, Field &out)=0; | ||||
|   virtual void  Mdag (const Field &in, Field &out)=0; | ||||
|   // Default composition: out = Mdag(M(in)), using a temporary built on in's grid. | ||||
|   // Overridable by implementations. | ||||
|   virtual void  MdagM(const Field &in, Field &out) { | ||||
|     Field tmp (in.Grid()); | ||||
|     M(in,tmp); | ||||
|     Mdag(tmp,out); | ||||
|   } | ||||
|   // NOTE(review): presumably the diagonal term, a single (dir,disp) hopping term and | ||||
|   // all directions at once (one output Field per entry) respectively — confirm against | ||||
|   // an implementation; only the signatures are visible here. | ||||
|   virtual  void Mdiag    (const Field &in, Field &out)=0; | ||||
|   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0; | ||||
|   virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0; | ||||
| }; | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Interface augmented by a red black sparse matrix, such as a Fermion action | ||||
|   ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> { | ||||
|     public: | ||||
|       virtual GridBase *RedBlackGrid(void)=0; | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Interface augmented by a red black sparse matrix, such as a Fermion action | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> { | ||||
| public: | ||||
|   virtual GridBase *RedBlackGrid(void)=0; | ||||
|  | ||||
|       ////////////////////////////////////////////////////////////////////// | ||||
|       // Query the even even properties to make algorithmic decisions | ||||
|       ////////////////////////////////////////////////////////////////////// | ||||
|       virtual RealD  Mass(void)        { return 0.0; }; | ||||
|       virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden | ||||
|       virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   // Query the even even properties to make algorithmic decisions | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   virtual RealD  Mass(void)        { return 0.0; }; | ||||
|   virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden | ||||
|   virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better | ||||
|  | ||||
|       // half checkerboard operations | ||||
|       virtual  void Meooe    (const Field &in, Field &out)=0; | ||||
|       virtual  void Mooee    (const Field &in, Field &out)=0; | ||||
|       virtual  void MooeeInv (const Field &in, Field &out)=0; | ||||
|   // half checkerboard operations | ||||
|   virtual  void Meooe    (const Field &in, Field &out)=0; | ||||
|   virtual  void Mooee    (const Field &in, Field &out)=0; | ||||
|   virtual  void MooeeInv (const Field &in, Field &out)=0; | ||||
|  | ||||
|       virtual  void MeooeDag    (const Field &in, Field &out)=0; | ||||
|       virtual  void MooeeDag    (const Field &in, Field &out)=0; | ||||
|       virtual  void MooeeInvDag (const Field &in, Field &out)=0; | ||||
|   virtual  void MeooeDag    (const Field &in, Field &out)=0; | ||||
|   virtual  void MooeeDag    (const Field &in, Field &out)=0; | ||||
|   virtual  void MooeeInvDag (const Field &in, Field &out)=0; | ||||
|  | ||||
|     }; | ||||
| }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -25,14 +25,14 @@ Author: Christoph Lehner <clehner@bnl.gov> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CHEBYSHEV_H | ||||
| #define GRID_CHEBYSHEV_H | ||||
|  | ||||
| #include <Grid/algorithms/LinearOperator.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| struct ChebyParams : Serializable { | ||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams, | ||||
| @@ -41,337 +41,369 @@ struct ChebyParams : Serializable { | ||||
| 				  int, Npoly); | ||||
| }; | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Generic Chebyshev approximations | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> | ||||
|   class Chebyshev : public OperatorFunction<Field> { | ||||
|   private: | ||||
|     std::vector<RealD> Coeffs; | ||||
|     int order; | ||||
|     RealD hi; | ||||
|     RealD lo; | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Generic Chebyshev approximations | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class Field> | ||||
| class Chebyshev : public OperatorFunction<Field> { | ||||
| private: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   public: | ||||
|     void csv(std::ostream &out){ | ||||
|       RealD diff = hi-lo; | ||||
|       RealD delta = (hi-lo)*1.0e-9; | ||||
|       for (RealD x=lo; x<hi; x+=delta) { | ||||
| 	delta*=1.1; | ||||
| 	RealD f = approx(x); | ||||
| 	out<< x<<" "<<f<<std::endl; | ||||
|       } | ||||
|       return; | ||||
|   std::vector<RealD> Coeffs; | ||||
|   int order; | ||||
|   RealD hi; | ||||
|   RealD lo; | ||||
|  | ||||
| public: | ||||
|   // Write "x approx(x)" pairs, one per line, for x running from lo up to (not including) hi. | ||||
|   // The step starts at 1e-9 of the interval and grows by 10% per sample, so the | ||||
|   // sampling is densest near lo. | ||||
|   void csv(std::ostream &out){ | ||||
|     const RealD span = hi-lo; | ||||
|     RealD step = span*1.0e-9; | ||||
|     RealD x = lo; | ||||
|     while (x<hi) { | ||||
|       // The step is enlarged before it is applied, exactly once per sample | ||||
|       step*=1.1; | ||||
|       const RealD value = approx(x); | ||||
|       out<< x<<" "<<value<<std::endl; | ||||
|       x+=step; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|     // Convenience for plotting the approximation | ||||
|     void   PlotApprox(std::ostream &out) { | ||||
|       out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl; | ||||
|       for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){ | ||||
| 	out <<x<<"\t"<<approx(x)<<std::endl; | ||||
|       } | ||||
|     }; | ||||
|  | ||||
|     Chebyshev(){}; | ||||
|     Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);}; | ||||
|     Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);}; | ||||
|     Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);}; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation". | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // CJ: the one we need for Lanczos | ||||
|     void Init(RealD _lo,RealD _hi,int _order) | ||||
|     { | ||||
|       lo=_lo; | ||||
|       hi=_hi; | ||||
|       order=_order; | ||||
|        | ||||
|       if(order < 2) exit(-1); | ||||
|       Coeffs.resize(order); | ||||
|       Coeffs.assign(0.,order); | ||||
|       Coeffs[order-1] = 1.; | ||||
|     }; | ||||
|  | ||||
|     void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD)) | ||||
|     { | ||||
|       lo=_lo; | ||||
|       hi=_hi; | ||||
|       order=_order; | ||||
|        | ||||
|       if(order < 2) exit(-1); | ||||
|       Coeffs.resize(order); | ||||
|       for(int j=0;j<order;j++){ | ||||
| 	RealD s=0; | ||||
| 	for(int k=0;k<order;k++){ | ||||
| 	  RealD y=std::cos(M_PI*(k+0.5)/order); | ||||
| 	  RealD x=0.5*(y*(hi-lo)+(hi+lo)); | ||||
| 	  RealD f=func(x); | ||||
| 	  s=s+f*std::cos( j*M_PI*(k+0.5)/order ); | ||||
| 	} | ||||
| 	Coeffs[j] = s * 2.0/order; | ||||
|       } | ||||
|     }; | ||||
|  | ||||
|      | ||||
|     void JacksonSmooth(void){ | ||||
|       RealD M=order; | ||||
|       RealD alpha = M_PI/(M+2); | ||||
|       RealD lmax = std::cos(alpha); | ||||
|       RealD sumUsq =0; | ||||
|       std::vector<RealD> U(M); | ||||
|       std::vector<RealD> a(M); | ||||
|       std::vector<RealD> g(M); | ||||
|       for(int n=0;n<=M;n++){ | ||||
| 	U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax)); | ||||
| 	sumUsq += U[n]*U[n]; | ||||
|       }       | ||||
|       sumUsq = std::sqrt(sumUsq); | ||||
|  | ||||
|       for(int i=1;i<=M;i++){ | ||||
| 	a[i] = U[i]/sumUsq; | ||||
|       } | ||||
|       g[0] = 1.0; | ||||
|       for(int m=1;m<=M;m++){ | ||||
| 	g[m] = 0; | ||||
| 	for(int i=0;i<=M-m;i++){ | ||||
| 	  g[m]+= a[i]*a[m+i]; | ||||
| 	} | ||||
|       } | ||||
|       for(int m=1;m<=M;m++){ | ||||
| 	Coeffs[m]*=g[m]; | ||||
|       } | ||||
|     } | ||||
|     RealD approx(RealD x) // Convenience for plotting the approximation | ||||
|     { | ||||
|       RealD Tn; | ||||
|       RealD Tnm; | ||||
|       RealD Tnp; | ||||
|        | ||||
|       RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); | ||||
|        | ||||
|       RealD T0=1; | ||||
|       RealD T1=y; | ||||
|        | ||||
|       RealD sum; | ||||
|       sum = 0.5*Coeffs[0]*T0; | ||||
|       sum+= Coeffs[1]*T1; | ||||
|        | ||||
|       Tn =T1; | ||||
|       Tnm=T0; | ||||
|       for(int i=2;i<order;i++){ | ||||
| 	Tnp=2*y*Tn-Tnm; | ||||
| 	Tnm=Tn; | ||||
| 	Tn =Tnp; | ||||
| 	sum+= Tn*Coeffs[i]; | ||||
|       } | ||||
|       return sum; | ||||
|     }; | ||||
|  | ||||
|     RealD approxD(RealD x) | ||||
|     { | ||||
|       RealD Un; | ||||
|       RealD Unm; | ||||
|       RealD Unp; | ||||
|        | ||||
|       RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); | ||||
|        | ||||
|       RealD U0=1; | ||||
|       RealD U1=2*y; | ||||
|        | ||||
|       RealD sum; | ||||
|       sum = Coeffs[1]*U0; | ||||
|       sum+= Coeffs[2]*U1*2.0; | ||||
|        | ||||
|       Un =U1; | ||||
|       Unm=U0; | ||||
|       for(int i=2;i<order-1;i++){ | ||||
| 	Unp=2*y*Un-Unm; | ||||
| 	Unm=Un; | ||||
| 	Un =Unp; | ||||
| 	sum+= Un*Coeffs[i+1]*(i+1.0); | ||||
|       } | ||||
|       return sum/(0.5*(hi-lo)); | ||||
|     }; | ||||
|      | ||||
|     RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) { | ||||
|       RealD x = x0; | ||||
|       RealD eps; | ||||
|        | ||||
|       int i; | ||||
|       for (i=0;i<maxiter;i++) { | ||||
| 	eps = approx(x) - z; | ||||
| 	if (fabs(eps / z) < resid) | ||||
| 	  return x; | ||||
| 	x = x - eps / approxD(x); | ||||
|       } | ||||
|        | ||||
|       return std::numeric_limits<double>::quiet_NaN(); | ||||
|     } | ||||
|      | ||||
|     // Implement the required interface | ||||
|     void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|  | ||||
|       GridBase *grid=in._grid; | ||||
|  | ||||
|       // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl; | ||||
|       //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl; | ||||
|  | ||||
|       int vol=grid->gSites(); | ||||
|  | ||||
|       Field T0(grid); T0 = in;   | ||||
|       Field T1(grid);  | ||||
|       Field T2(grid); | ||||
|       Field y(grid); | ||||
|        | ||||
|       Field *Tnm = &T0; | ||||
|       Field *Tn  = &T1; | ||||
|       Field *Tnp = &T2; | ||||
|  | ||||
|       // Tn=T1 = (xscale M + mscale)in | ||||
|       RealD xscale = 2.0/(hi-lo); | ||||
|       RealD mscale = -(hi+lo)/(hi-lo); | ||||
|       Linop.HermOp(T0,y); | ||||
|       T1=y*xscale+in*mscale; | ||||
|  | ||||
|       // sum = .5 c[0] T0 + c[1] T1 | ||||
|       out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; | ||||
|       for(int n=2;n<order;n++){ | ||||
| 	 | ||||
| 	Linop.HermOp(*Tn,y); | ||||
|  | ||||
| 	y=xscale*y+mscale*(*Tn); | ||||
|  | ||||
| 	*Tnp=2.0*y-(*Tnm); | ||||
|  | ||||
| 	out=out+Coeffs[n]* (*Tnp); | ||||
|  | ||||
| 	// Cycle pointers to avoid copies | ||||
| 	Field *swizzle = Tnm; | ||||
| 	Tnm    =Tn; | ||||
| 	Tn     =Tnp; | ||||
| 	Tnp    =swizzle; | ||||
| 	   | ||||
|       } | ||||
|   // Plotting helper: prints a header naming the interval, then 'x<TAB>approx(x)' | ||||
|   // at steps of one fiftieth of the interval, from lo up to (not including) hi. | ||||
|   void   PlotApprox(std::ostream &out) { | ||||
|     out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl; | ||||
|     const RealD step = (hi-lo)/50.0; | ||||
|     RealD x = lo; | ||||
|     while (x<hi) { | ||||
|       out <<x<<"\t"<<approx(x)<<std::endl; | ||||
|       x+=step; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   // Default construction performs no initialisation; call one of the Init functions before use. | ||||
|   Chebyshev(){}; | ||||
|   // Forwards (p.alpha, p.beta, p.Npoly) to the three-argument Init as (lo, hi, order). | ||||
|   Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);}; | ||||
|   // Forwards to the Init overload that fits the coefficients to the function `func`. | ||||
|   Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);}; | ||||
|   // Forwards to the three-argument Init (no function supplied). | ||||
|   Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);}; | ||||
|  | ||||
|   template<class Field> | ||||
|   class ChebyshevLanczos : public Chebyshev<Field> { | ||||
|   private: | ||||
|     std::vector<RealD> Coeffs; | ||||
|     int order; | ||||
|     RealD alpha; | ||||
|     RealD beta; | ||||
|     RealD mu; | ||||
|  | ||||
|   public: | ||||
|     ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) : | ||||
|     alpha(_alpha), | ||||
|       beta(_beta), | ||||
|           mu(_mu) | ||||
|     { | ||||
|       order=_order; | ||||
|       Coeffs.resize(order); | ||||
|       for(int i=0;i<_order;i++){ | ||||
| 	Coeffs[i] = 0.0; | ||||
|       } | ||||
|       Coeffs[order-1]=1.0; | ||||
|     }; | ||||
|  | ||||
|     void csv(std::ostream &out){ | ||||
|       for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) { | ||||
| 	RealD f = approx(x); | ||||
| 	out<< x<<" "<<f<<std::endl; | ||||
|       } | ||||
|       return; | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation". | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // CJ: the one we need for Lanczos | ||||
|   // Configure the interval [_lo,_hi] and select the single polynomial of index _order-1: | ||||
|   // every coefficient is zero except the last, which is one. | ||||
|   // Terminates the program via exit(-1) when _order < 2. | ||||
|   void Init(RealD _lo,RealD _hi,int _order) | ||||
|   { | ||||
|     lo=_lo; | ||||
|     hi=_hi; | ||||
|     order=_order; | ||||
|        | ||||
|     if(order < 2) exit(-1); | ||||
|     // std::vector::assign takes (count, value). The previous call passed (0., order), | ||||
|     // i.e. "zero copies of the value order", which emptied the vector and made the | ||||
|     // write below land out of bounds. | ||||
|     Coeffs.assign(order,0.0); | ||||
|     Coeffs[order-1] = 1.; | ||||
|   }; | ||||
|    | ||||
|   // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. | ||||
|   // Similar kick effect below the threshold as Lanczos filter approach | ||||
|   // Set the interval and fill the coefficients with | ||||
|   //   Coeffs[j] = (2/order) * cos( j*pi*(k+1/2)/order ),  k = order-1, | ||||
|   // i.e. only the k = order-1 term of the sum used by the function-fitting Init. | ||||
|   // Terminates the program via exit(-1) when _order < 2. | ||||
|   void InitLowPass(RealD _lo,RealD _hi,int _order) | ||||
|   { | ||||
|     lo=_lo; | ||||
|     hi=_hi; | ||||
|     order=_order; | ||||
|  | ||||
|     if(order < 2) exit(-1); | ||||
|     Coeffs.resize(order); | ||||
|  | ||||
|     // The node index is the same for every coefficient, so form (k+0.5) once | ||||
|     const RealD node = (order-1.0); | ||||
|     const RealD nodePlusHalf = node+0.5; | ||||
|     int j = 0; | ||||
|     while (j<order) { | ||||
|       const RealD c = std::cos( j*M_PI*nodePlusHalf/order ); | ||||
|       Coeffs[j] = c * 2.0/order; | ||||
|       j++; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|     RealD approx(RealD xx) // Convenience for plotting the approximation | ||||
|     { | ||||
|       RealD Tn; | ||||
|       RealD Tnm; | ||||
|       RealD Tnp; | ||||
|       Real aa = alpha * alpha; | ||||
|       Real bb = beta  *  beta; | ||||
|   void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD)) | ||||
|   { | ||||
|     lo=_lo; | ||||
|     hi=_hi; | ||||
|     order=_order; | ||||
|        | ||||
|       RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb); | ||||
|  | ||||
|       RealD y= x; | ||||
|        | ||||
|       RealD T0=1; | ||||
|       RealD T1=y; | ||||
|        | ||||
|       RealD sum; | ||||
|       sum = 0.5*Coeffs[0]*T0; | ||||
|       sum+= Coeffs[1]*T1; | ||||
|        | ||||
|       Tn =T1; | ||||
|       Tnm=T0; | ||||
|       for(int i=2;i<order;i++){ | ||||
| 	Tnp=2*y*Tn-Tnm; | ||||
| 	Tnm=Tn; | ||||
| 	Tn =Tnp; | ||||
| 	sum+= Tn*Coeffs[i]; | ||||
|       } | ||||
|       return sum; | ||||
|     }; | ||||
|  | ||||
|     // shift_Multiply in Rudy's code | ||||
|     void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)  | ||||
|     { | ||||
|       GridBase *grid=in._grid; | ||||
|       Field tmp(grid); | ||||
|  | ||||
|       RealD aa= alpha*alpha; | ||||
|       RealD bb= beta * beta; | ||||
|  | ||||
|       Linop.HermOp(in,out); | ||||
|       out = out - mu*in; | ||||
|  | ||||
|       Linop.HermOp(out,tmp); | ||||
|       tmp = tmp - mu * out; | ||||
|  | ||||
|       out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in; | ||||
|     }; | ||||
|     // Implement the required interface | ||||
|     void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|  | ||||
|       GridBase *grid=in._grid; | ||||
|  | ||||
|       int vol=grid->gSites(); | ||||
|  | ||||
|       Field T0(grid); T0 = in;   | ||||
|       Field T1(grid);  | ||||
|       Field T2(grid); | ||||
|       Field  y(grid); | ||||
|        | ||||
|       Field *Tnm = &T0; | ||||
|       Field *Tn  = &T1; | ||||
|       Field *Tnp = &T2; | ||||
|  | ||||
|       // Tn=T1 = (xscale M )*in | ||||
|       AminusMuSq(Linop,T0,T1); | ||||
|  | ||||
|       // sum = .5 c[0] T0 + c[1] T1 | ||||
|       out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; | ||||
|       for(int n=2;n<order;n++){ | ||||
| 	 | ||||
| 	AminusMuSq(Linop,*Tn,y); | ||||
|  | ||||
| 	*Tnp=2.0*y-(*Tnm); | ||||
|  | ||||
| 	out=out+Coeffs[n]* (*Tnp); | ||||
|  | ||||
| 	// Cycle pointers to avoid copies | ||||
| 	Field *swizzle = Tnm; | ||||
| 	Tnm    =Tn; | ||||
| 	Tn     =Tnp; | ||||
| 	Tnp    =swizzle; | ||||
| 	   | ||||
|     if(order < 2) exit(-1); | ||||
|     Coeffs.resize(order); | ||||
|     for(int j=0;j<order;j++){ | ||||
|       RealD s=0; | ||||
|       for(int k=0;k<order;k++){ | ||||
| 	RealD y=std::cos(M_PI*(k+0.5)/order); | ||||
| 	RealD x=0.5*(y*(hi-lo)+(hi+lo)); | ||||
| 	RealD f=func(x); | ||||
| 	s=s+f*std::cos( j*M_PI*(k+0.5)/order ); | ||||
|       } | ||||
|       Coeffs[j] = s * 2.0/order; | ||||
|     } | ||||
|   }; | ||||
| } | ||||
|  | ||||
|      | ||||
|   // Damp the stored coefficients in place: Coeffs[m] *= g[m] for m >= 1, where | ||||
|   //   U[n] = sin((n+1)*theta)/sin(theta),  theta = acos(cos(pi/(order+2))), | ||||
|   //   a[i] = U[i]/sqrt(sum_n U[n]^2)  for i >= 1, | ||||
|   //   g[m] = sum_{i=0}^{M-m} a[i]*a[m+i],  M = order. | ||||
|   // Coeffs[0] is left untouched. | ||||
|   void JacksonSmooth(void){ | ||||
|     const int M = order; | ||||
|     const RealD alpha = M_PI/(M+2); | ||||
|     const RealD lmax = std::cos(alpha); | ||||
|     const RealD theta = std::acos(lmax); // loop invariant, evaluated once | ||||
|     RealD sumUsq =0; | ||||
|     // Indices 0..M inclusive are used below, so M+1 slots are needed; sizing these | ||||
|     // with M made the n==M / i==M / m==M accesses run one past the end. | ||||
|     std::vector<RealD> U(M+1); | ||||
|     std::vector<RealD> a(M+1); | ||||
|     std::vector<RealD> g(M+1); | ||||
|     for(int n=0;n<=M;n++){ | ||||
|       U[n] = std::sin((n+1)*theta)/std::sin(theta); | ||||
|       sumUsq += U[n]*U[n]; | ||||
|     } | ||||
|     sumUsq = std::sqrt(sumUsq); | ||||
|  | ||||
|     // NOTE(review): a[0] is never assigned and therefore stays 0, as in the original; | ||||
|     // confirm whether the sum was meant to include U[0]/norm. | ||||
|     for(int i=1;i<=M;i++){ | ||||
|       a[i] = U[i]/sumUsq; | ||||
|     } | ||||
|     g[0] = 1.0; | ||||
|     for(int m=1;m<=M;m++){ | ||||
|       g[m] = 0; | ||||
|       for(int i=0;i<=M-m;i++){ | ||||
|         g[m]+= a[i]*a[m+i]; | ||||
|       } | ||||
|     } | ||||
|     // Init sizes Coeffs to `order` entries, so the last valid index is order-1; | ||||
|     // the previous bound (m<=M) touched Coeffs[order]. | ||||
|     for(int m=1;m<order;m++){ | ||||
|       Coeffs[m]*=g[m]; | ||||
|     } | ||||
|   } | ||||
|   // Scalar evaluation of the stored series at x (handy for plotting): | ||||
|   //   0.5*Coeffs[0] + sum_{n>=1} Coeffs[n]*T_n(y),  y = (x-(hi+lo)/2)/((hi-lo)/2), | ||||
|   // with T_n generated by the recurrence T_{n+1} = 2*y*T_n - T_{n-1}. | ||||
|   RealD approx(RealD x) | ||||
|   { | ||||
|     const RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); | ||||
|  | ||||
|     RealD previous=1;   // T_0 | ||||
|     RealD current =y;   // T_1 | ||||
|  | ||||
|     RealD total = 0.5*Coeffs[0]*previous; | ||||
|     total+= Coeffs[1]*current; | ||||
|  | ||||
|     for(int n=2;n<order;n++){ | ||||
|       const RealD next=2*y*current-previous; | ||||
|       previous=current; | ||||
|       current =next; | ||||
|       total+= current*Coeffs[n]; | ||||
|     } | ||||
|     return total; | ||||
|   }; | ||||
|  | ||||
|   // Derivative of approx() with respect to x. | ||||
|   // Accumulates n*Coeffs[n]*U_{n-1}(y) for n = 1..order-1, with U generated by | ||||
|   // U_{n+1} = 2*y*U_n - U_{n-1}, then divides by (hi-lo)/2 to convert d/dy into d/dx. | ||||
|   RealD approxD(RealD x) | ||||
|   { | ||||
|     RealD Un; | ||||
|     RealD Unm; | ||||
|     RealD Unp; | ||||
|        | ||||
|     RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); | ||||
|        | ||||
|     RealD U0=1; | ||||
|     RealD U1=2*y; | ||||
|        | ||||
|     RealD sum; | ||||
|     sum = Coeffs[1]*U0; | ||||
|     // Init sizes Coeffs to `order` entries and accepts order == 2, in which case | ||||
|     // Coeffs[2] does not exist; only add the n = 2 term when it is present. | ||||
|     if(order > 2) sum+= Coeffs[2]*U1*2.0; | ||||
|        | ||||
|     Un =U1; | ||||
|     Unm=U0; | ||||
|     for(int i=2;i<order-1;i++){ | ||||
|       Unp=2*y*Un-Unm; | ||||
|       Unm=Un; | ||||
|       Un =Unp; | ||||
|       sum+= Un*Coeffs[i+1]*(i+1.0); | ||||
|     } | ||||
|     return sum/(0.5*(hi-lo)); | ||||
|   }; | ||||
|      | ||||
|   // Solve approx(x) == z by Newton iteration starting from x0. | ||||
|   // Returns the current x as soon as |(approx(x)-z)/z| < resid; returns a quiet NaN | ||||
|   // when that has not happened within maxiter iterations. | ||||
|   RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) { | ||||
|     RealD guess = x0; | ||||
|  | ||||
|     for (int iter=0;iter<maxiter;iter++) { | ||||
|       const RealD mismatch = approx(guess) - z; | ||||
|       if (fabs(mismatch / z) < resid) { | ||||
|         return guess; | ||||
|       } | ||||
|       // Newton update using the analytic derivative | ||||
|       guess = guess - mismatch / approxD(guess); | ||||
|     } | ||||
|  | ||||
|     return std::numeric_limits<double>::quiet_NaN(); | ||||
|   } | ||||
|      | ||||
|   // Implement the required interface | ||||
|   // Applies the stored polynomial in the Hermitian operator of Linop to `in`, writing `out`. | ||||
|   // Mirrors the scalar recurrence of approx(), with Linop.HermOp playing the role of x: | ||||
|   // T0 = in, T1 = (xscale*HermOp + mscale) in, T_{n+1} = 2*(xscale*HermOp + mscale) T_n - T_{n-1}. | ||||
|   // Reads Coeffs[0] and Coeffs[1] unconditionally. | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|  | ||||
|     GridBase *grid=in.Grid(); | ||||
|  | ||||
|     // NOTE(review): vol is not used in the enabled branch; vector_type is used only by the #if 0 branch | ||||
|     int vol=grid->gSites(); | ||||
|     typedef typename Field::vector_type vector_type; | ||||
|  | ||||
|     Field T0(grid); T0 = in;   | ||||
|     Field T1(grid);  | ||||
|     Field T2(grid); | ||||
|     Field y(grid); | ||||
|        | ||||
|     // Three work fields are rotated through these pointers so no field is copied per step | ||||
|     Field *Tnm = &T0; | ||||
|     Field *Tn  = &T1; | ||||
|     Field *Tnp = &T2; | ||||
|  | ||||
|     // Tn=T1 = (xscale M + mscale)in | ||||
|     // xscale/mscale are the affine map taking [lo,hi] onto [-1,1] | ||||
|     RealD xscale = 2.0/(hi-lo); | ||||
|     RealD mscale = -(hi+lo)/(hi-lo); | ||||
|     Linop.HermOp(T0,y); | ||||
|     // presumably T1 = xscale*y + mscale*in (axpby is defined elsewhere — confirm argument convention) | ||||
|     axpby(T1,xscale,mscale,y,in); | ||||
|  | ||||
|     // sum = .5 c[0] T0 + c[1] T1 | ||||
|     //    out = ()*T0 + Coeffs[1]*T1; | ||||
|     axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1); | ||||
|     for(int n=2;n<order;n++){ | ||||
|  | ||||
|       Linop.HermOp(*Tn,y); | ||||
|       // Disabled alternative: the same two updates fused into one accelerator loop | ||||
| #if 0 | ||||
|       auto y_v = y.View(); | ||||
|       auto Tn_v = Tn->View(); | ||||
|       auto Tnp_v = Tnp->View(); | ||||
|       auto Tnm_v = Tnm->View(); | ||||
|       constexpr int Nsimd = vector_type::Nsimd(); | ||||
|       accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { | ||||
| 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); | ||||
| 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); | ||||
|       }); | ||||
|       if ( Coeffs[n] != 0.0) { | ||||
| 	axpy(out,Coeffs[n],*Tnp,out); | ||||
|       } | ||||
| #else | ||||
|       // y <- xscale*y + mscale*Tn, then Tnp <- 2*y - Tnm (same updates as the disabled branch above) | ||||
|       axpby(y,xscale,mscale,y,(*Tn)); | ||||
|       axpby(*Tnp,2.0,-1.0,y,(*Tnm)); | ||||
|       // Terms with an exactly zero coefficient are skipped rather than accumulated | ||||
|       if ( Coeffs[n] != 0.0) { | ||||
| 	axpy(out,Coeffs[n],*Tnp,out); | ||||
|       } | ||||
| #endif | ||||
|       // Cycle pointers to avoid copies | ||||
|       Field *swizzle = Tnm; | ||||
|       Tnm    =Tn; | ||||
|       Tn     =Tnp; | ||||
|       Tnp    =swizzle; | ||||
| 	   | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| // Chebyshev polynomial in the shifted, squared operator | ||||
| //   X = ( 2*(A-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2), | ||||
| // with only the highest-index coefficient set (Coeffs[order-1] = 1, all others 0). | ||||
| // Coeffs and order declared here are this class's own members; the base class keeps | ||||
| // private members of the same names, which are not touched by this class. | ||||
| template<class Field> | ||||
| class ChebyshevLanczos : public Chebyshev<Field> { | ||||
| private: | ||||
|   std::vector<RealD> Coeffs; | ||||
|   int order; | ||||
|   RealD alpha; | ||||
|   RealD beta; | ||||
|   RealD mu; | ||||
|  | ||||
| public: | ||||
|   // Stores the parameters and selects the single term of index _order-1. | ||||
|   // Terminates the program via exit(-1) when _order < 2, as the base-class Init does: | ||||
|   // approx() and operator() read Coeffs[0] and Coeffs[1] unconditionally, and a | ||||
|   // non-positive order would make the Coeffs[order-1] write below land out of bounds. | ||||
|   ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) : | ||||
|     alpha(_alpha), | ||||
|     beta(_beta), | ||||
|     mu(_mu) | ||||
|   { | ||||
|     order=_order; | ||||
|     if(order < 2) exit(-1); | ||||
|     Coeffs.resize(order); | ||||
|     for(int i=0;i<_order;i++){ | ||||
|       Coeffs[i] = 0.0; | ||||
|     } | ||||
|     Coeffs[order-1]=1.0; | ||||
|   }; | ||||
|  | ||||
|   // Write "x approx(x)" pairs for x in [-1.2*alpha, 1.2*alpha) at steps of 2*alpha/10000. | ||||
|   void csv(std::ostream &out){ | ||||
|     for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) { | ||||
|       RealD f = approx(x); | ||||
|       out<< x<<" "<<f<<std::endl; | ||||
|     } | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   // Scalar counterpart of operator(): evaluates the series at the mapped argument | ||||
|   //   y = ( 2*(xx-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2). | ||||
|   RealD approx(RealD xx) // Convenience for plotting the approximation | ||||
|   { | ||||
|     RealD Tn; | ||||
|     RealD Tnm; | ||||
|     RealD Tnp; | ||||
|     // RealD, matching AminusMuSq, so the scalar and operator forms use the same mapping constants | ||||
|     RealD aa = alpha * alpha; | ||||
|     RealD bb = beta  *  beta; | ||||
|  | ||||
|     RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb); | ||||
|  | ||||
|     RealD y= x; | ||||
|  | ||||
|     RealD T0=1; | ||||
|     RealD T1=y; | ||||
|  | ||||
|     RealD sum; | ||||
|     sum = 0.5*Coeffs[0]*T0; | ||||
|     sum+= Coeffs[1]*T1; | ||||
|  | ||||
|     Tn =T1; | ||||
|     Tnm=T0; | ||||
|     for(int i=2;i<order;i++){ | ||||
|       Tnp=2*y*Tn-Tnm; | ||||
|       Tnm=Tn; | ||||
|       Tn =Tnp; | ||||
|       sum+= Tn*Coeffs[i]; | ||||
|     } | ||||
|     return sum; | ||||
|   }; | ||||
|  | ||||
|   // shift_Multiply in Rudy's code | ||||
|   // out = (2/(aa-bb)) * (A-mu)^2 in - ((aa+bb)/(aa-bb)) * in, where A is Linop.HermOp. | ||||
|   // `out` doubles as scratch for (A-mu) in, so it must not alias `in`. | ||||
|   void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)  | ||||
|   { | ||||
|     GridBase *grid=in.Grid(); | ||||
|     Field tmp(grid); | ||||
|  | ||||
|     RealD aa= alpha*alpha; | ||||
|     RealD bb= beta * beta; | ||||
|  | ||||
|     Linop.HermOp(in,out); | ||||
|     out = out - mu*in; | ||||
|  | ||||
|     Linop.HermOp(out,tmp); | ||||
|     tmp = tmp - mu * out; | ||||
|  | ||||
|     out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in; | ||||
|   }; | ||||
|   // Implement the required interface | ||||
|   // Same three-term recurrence as approx(), with AminusMuSq supplying the mapped operator: | ||||
|   // T0 = in, T1 = X in, T_{n+1} = 2*X*T_n - T_{n-1}; out accumulates Coeffs[n]*T_n. | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|  | ||||
|     GridBase *grid=in.Grid(); | ||||
|  | ||||
|     Field T0(grid); T0 = in; | ||||
|     Field T1(grid); | ||||
|     Field T2(grid); | ||||
|     Field  y(grid); | ||||
|  | ||||
|     Field *Tnm = &T0; | ||||
|     Field *Tn  = &T1; | ||||
|     Field *Tnp = &T2; | ||||
|  | ||||
|     // Tn=T1 = (xscale M )*in | ||||
|     AminusMuSq(Linop,T0,T1); | ||||
|  | ||||
|     // sum = .5 c[0] T0 + c[1] T1 | ||||
|     out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; | ||||
|     for(int n=2;n<order;n++){ | ||||
|  | ||||
|       AminusMuSq(Linop,*Tn,y); | ||||
|  | ||||
|       *Tnp=2.0*y-(*Tnm); | ||||
|  | ||||
|       out=out+Coeffs[n]* (*Tnp); | ||||
|  | ||||
|       // Cycle pointers to avoid copies | ||||
|       Field *swizzle = Tnm; | ||||
|       Tnm    =Tn; | ||||
|       Tn     =Tnp; | ||||
|       Tnp    =swizzle; | ||||
|  | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -26,127 +26,127 @@ with this program; if not, write to the Free Software Foundation, Inc., | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| 			   /*  END LEGAL */ | ||||
|  | ||||
| #ifndef INCLUDED_FORECAST_H | ||||
| #define INCLUDED_FORECAST_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   // Abstract base class. | ||||
|   // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) | ||||
|   // and returns a forecasted solution to the system D*psi = phi (psi). | ||||
|   template<class Matrix, class Field> | ||||
|   class Forecast | ||||
| // Abstract base class. | ||||
| // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) | ||||
| // and returns a forecasted solution to the system D*psi = phi (psi). | ||||
| template<class Matrix, class Field> | ||||
| class Forecast | ||||
| { | ||||
| public: | ||||
|   // Pure virtual: implementations return the forecast Field by value, given the | ||||
|   // matrix, the source phi and the vector of Fields chi (both taken by const reference). | ||||
|   virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0; | ||||
| }; | ||||
|  | ||||
| // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012), | ||||
| // used to forecast solutions across poles of the EOFA heatbath. | ||||
| // | ||||
| // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C) | ||||
| template<class Matrix, class Field> | ||||
| class ChronoForecast : public Forecast<Matrix,Field> | ||||
| { | ||||
| public: | ||||
|   Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns) | ||||
|   { | ||||
|     public: | ||||
|       virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0; | ||||
|     int degree = prev_solns.size(); | ||||
|     Field chi(phi); // forecasted solution | ||||
|  | ||||
|     // Trivial cases | ||||
|     if(degree == 0){ chi = Zero(); return chi; } | ||||
|     else if(degree == 1){ return prev_solns[0]; } | ||||
|  | ||||
|     //    RealD dot; | ||||
|     ComplexD xp; | ||||
|     Field r(phi); // residual | ||||
|     Field Mv(phi); | ||||
|     std::vector<Field> v(prev_solns); // orthonormalized previous solutions | ||||
|     std::vector<Field> MdagMv(degree,phi); | ||||
|  | ||||
|     // Array to hold the matrix elements | ||||
|     std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree)); | ||||
|  | ||||
|     // Solution and source vectors | ||||
|     std::vector<ComplexD> a(degree); | ||||
|     std::vector<ComplexD> b(degree); | ||||
|  | ||||
|     // Orthonormalize the vector basis | ||||
|     for(int i=0; i<degree; i++){ | ||||
|       v[i] *= 1.0/std::sqrt(norm2(v[i])); | ||||
|       for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; } | ||||
|     } | ||||
|  | ||||
|     // Perform sparse matrix multiplication and construct rhs | ||||
|     for(int i=0; i<degree; i++){ | ||||
|       b[i] = innerProduct(v[i],phi); | ||||
|       Mat.M(v[i],Mv); | ||||
|       Mat.Mdag(Mv,MdagMv[i]); | ||||
|       G[i][i] = innerProduct(v[i],MdagMv[i]); | ||||
|     } | ||||
|  | ||||
|     // Construct the matrix | ||||
|     for(int j=0; j<degree; j++){ | ||||
|       for(int k=j+1; k<degree; k++){ | ||||
| 	G[j][k] = innerProduct(v[j],MdagMv[k]); | ||||
| 	G[k][j] = conjugate(G[j][k]); | ||||
|       }} | ||||
|  | ||||
|     // Gauss-Jordan elimination with partial pivoting | ||||
|     for(int i=0; i<degree; i++){ | ||||
|  | ||||
|       // Perform partial pivoting | ||||
|       int k = i; | ||||
|       for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } } | ||||
|       if(k != i){ | ||||
| 	xp = b[k]; | ||||
| 	b[k] = b[i]; | ||||
| 	b[i] = xp; | ||||
| 	for(int j=0; j<degree; j++){ | ||||
| 	  xp = G[k][j]; | ||||
| 	  G[k][j] = G[i][j]; | ||||
| 	  G[i][j] = xp; | ||||
| 	} | ||||
|       } | ||||
|  | ||||
|       // Convert matrix to upper triangular form | ||||
|       for(int j=i+1; j<degree; j++){ | ||||
| 	xp = G[j][i]/G[i][i]; | ||||
| 	b[j] -= xp * b[i]; | ||||
| 	for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; } | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     // Use Gaussian elimination to solve equations and calculate initial guess | ||||
|     chi = Zero(); | ||||
|     r = phi; | ||||
|     for(int i=degree-1; i>=0; i--){ | ||||
|       a[i] = 0.0; | ||||
|       for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; } | ||||
|       a[i] = (b[i]-a[i])/G[i][i]; | ||||
|       chi += a[i]*v[i]; | ||||
|       r -= a[i]*MdagMv[i]; | ||||
|     } | ||||
|  | ||||
|     RealD true_r(0.0); | ||||
|     ComplexD tmp; | ||||
|     for(int i=0; i<degree; i++){ | ||||
|       tmp = -b[i]; | ||||
|       for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; } | ||||
|       tmp = conjugate(tmp)*tmp; | ||||
|       true_r += std::sqrt(tmp.real()); | ||||
|     } | ||||
|  | ||||
|     RealD error = std::sqrt(norm2(r)/norm2(phi)); | ||||
|     std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl; | ||||
|  | ||||
|     return chi; | ||||
|   }; | ||||
| }; | ||||
|  | ||||
|   // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012), | ||||
|   // used to forecast solutions across poles of the EOFA heatbath. | ||||
|   // | ||||
|   // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C) | ||||
|   template<class Matrix, class Field> | ||||
|   class ChronoForecast : public Forecast<Matrix,Field> | ||||
|   { | ||||
|     public: | ||||
|       Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns) | ||||
|       { | ||||
|         int degree = prev_solns.size(); | ||||
|         Field chi(phi); // forecasted solution | ||||
|  | ||||
|         // Trivial cases | ||||
|         if(degree == 0){ chi = zero; return chi; } | ||||
|         else if(degree == 1){ return prev_solns[0]; } | ||||
|  | ||||
|         RealD dot; | ||||
|         ComplexD xp; | ||||
|         Field r(phi); // residual | ||||
|         Field Mv(phi); | ||||
|         std::vector<Field> v(prev_solns); // orthonormalized previous solutions | ||||
|         std::vector<Field> MdagMv(degree,phi); | ||||
|  | ||||
|         // Array to hold the matrix elements | ||||
|         std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree)); | ||||
|  | ||||
|         // Solution and source vectors | ||||
|         std::vector<ComplexD> a(degree); | ||||
|         std::vector<ComplexD> b(degree); | ||||
|  | ||||
|         // Orthonormalize the vector basis | ||||
|         for(int i=0; i<degree; i++){ | ||||
|           v[i] *= 1.0/std::sqrt(norm2(v[i])); | ||||
|           for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; } | ||||
|         } | ||||
|  | ||||
|         // Perform sparse matrix multiplication and construct rhs | ||||
|         for(int i=0; i<degree; i++){ | ||||
|           b[i] = innerProduct(v[i],phi); | ||||
|           Mat.M(v[i],Mv); | ||||
|           Mat.Mdag(Mv,MdagMv[i]); | ||||
|           G[i][i] = innerProduct(v[i],MdagMv[i]); | ||||
|         } | ||||
|  | ||||
|         // Construct the matrix | ||||
|         for(int j=0; j<degree; j++){ | ||||
|         for(int k=j+1; k<degree; k++){ | ||||
|           G[j][k] = innerProduct(v[j],MdagMv[k]); | ||||
|           G[k][j] = std::conj(G[j][k]); | ||||
|         }} | ||||
|  | ||||
|         // Gauss-Jordan elimination with partial pivoting | ||||
|         for(int i=0; i<degree; i++){ | ||||
|  | ||||
|           // Perform partial pivoting | ||||
|           int k = i; | ||||
|           for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } } | ||||
|           if(k != i){ | ||||
|             xp = b[k]; | ||||
|             b[k] = b[i]; | ||||
|             b[i] = xp; | ||||
|             for(int j=0; j<degree; j++){ | ||||
|               xp = G[k][j]; | ||||
|               G[k][j] = G[i][j]; | ||||
|               G[i][j] = xp; | ||||
|             } | ||||
|           } | ||||
|  | ||||
|           // Convert matrix to upper triangular form | ||||
|           for(int j=i+1; j<degree; j++){ | ||||
|             xp = G[j][i]/G[i][i]; | ||||
|             b[j] -= xp * b[i]; | ||||
|             for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; } | ||||
|           } | ||||
|         } | ||||
|  | ||||
|         // Use Gaussian elimination to solve equations and calculate initial guess | ||||
|         chi = zero; | ||||
|         r = phi; | ||||
|         for(int i=degree-1; i>=0; i--){ | ||||
|           a[i] = 0.0; | ||||
|           for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; } | ||||
|           a[i] = (b[i]-a[i])/G[i][i]; | ||||
|           chi += a[i]*v[i]; | ||||
|           r -= a[i]*MdagMv[i]; | ||||
|         } | ||||
|  | ||||
|         RealD true_r(0.0); | ||||
|         ComplexD tmp; | ||||
|         for(int i=0; i<degree; i++){ | ||||
|           tmp = -b[i]; | ||||
|           for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; } | ||||
|           tmp = std::conj(tmp)*tmp; | ||||
|           true_r += std::sqrt(tmp.real()); | ||||
|         } | ||||
|  | ||||
|         RealD error = std::sqrt(norm2(r)/norm2(phi)); | ||||
|         std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl; | ||||
|  | ||||
|         return chi; | ||||
|       }; | ||||
|   }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										129
									
								
								Grid/algorithms/approx/JacobiPolynomial.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										129
									
								
								Grid/algorithms/approx/JacobiPolynomial.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,129 @@ | ||||
| #ifndef GRID_JACOBIPOLYNOMIAL_H | ||||
| #define GRID_JACOBIPOLYNOMIAL_H | ||||
|  | ||||
| #include <Grid/algorithms/LinearOperator.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| template<class Field> | ||||
| class JacobiPolynomial : public OperatorFunction<Field> { | ||||
|  private: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   int order; | ||||
|   RealD hi; | ||||
|   RealD lo; | ||||
|   RealD alpha; | ||||
|   RealD beta; | ||||
|  | ||||
|  public: | ||||
|   void csv(std::ostream &out){ | ||||
|     csv(out,lo,hi); | ||||
|   } | ||||
|   void csv(std::ostream &out,RealD llo,RealD hhi){ | ||||
|     RealD diff = hhi-llo; | ||||
|     RealD delta = diff*1.0e-5; | ||||
|     for (RealD x=llo-delta; x<=hhi; x+=delta) { | ||||
|       RealD f = approx(x); | ||||
|       out<< x<<" "<<f <<std::endl; | ||||
|     } | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   JacobiPolynomial(){}; | ||||
|   JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta) | ||||
|   { | ||||
|       lo=_lo; | ||||
|       hi=_hi; | ||||
|       alpha=_alpha; | ||||
|       beta=_beta; | ||||
|       order=_order; | ||||
|   }; | ||||
|  | ||||
|   RealD approx(RealD x) // Convenience for plotting the approximation                                                        | ||||
|   { | ||||
|     RealD Tn; | ||||
|     RealD Tnm; | ||||
|     RealD Tnp; | ||||
|  | ||||
|     RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); | ||||
|  | ||||
|     RealD T0=1.0; | ||||
|     RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y; | ||||
|  | ||||
|     Tn =T1; | ||||
|     Tnm=T0; | ||||
|     for(int n=2;n<=order;n++){ | ||||
|       RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta); | ||||
|       RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta); | ||||
|       RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta); | ||||
|       RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta); | ||||
|       Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp; | ||||
|       Tnm=Tn; | ||||
|       Tn =Tnp; | ||||
|     } | ||||
|     return Tnp; | ||||
|   }; | ||||
|  | ||||
|   // Implement the required interface                                                                                        | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { | ||||
|     GridBase *grid=in.Grid(); | ||||
|  | ||||
|     int vol=grid->gSites(); | ||||
|  | ||||
|     Field T0(grid); | ||||
|     Field T1(grid); | ||||
|     Field T2(grid); | ||||
|     Field y(grid); | ||||
|  | ||||
|  | ||||
|     Field *Tnm = &T0; | ||||
|     Field *Tn  = &T1; | ||||
|     Field *Tnp = &T2; | ||||
|  | ||||
|     //    RealD T0=1.0;                                                                                                      | ||||
|     T0=in; | ||||
|  | ||||
|     //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));                                                                            | ||||
|     //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)                                                                           | ||||
|     Linop.HermOp(T0,y); | ||||
|     RealD xscale = 2.0/(hi-lo); | ||||
|     RealD mscale = -(hi+lo)/(hi-lo); | ||||
|     Linop.HermOp(T0,y); | ||||
|     y=y*xscale+in*mscale; | ||||
|  | ||||
|     // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y; | ||||
|     RealD halfAmB  = (alpha-beta)*0.5; | ||||
|     RealD halfApBp2= (alpha+beta+2.0)*0.5; | ||||
|     T1 = halfAmB * in + halfApBp2*y; | ||||
|  | ||||
|     for(int n=2;n<=order;n++){ | ||||
|  | ||||
|       Linop.HermOp(*Tn,y); | ||||
|       y=xscale*y+mscale*(*Tn); | ||||
|  | ||||
|       RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta); | ||||
|       RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta); | ||||
|       RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta); | ||||
|       RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta); | ||||
|  | ||||
|       //      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;                                                              | ||||
|       cny=cny/cnp; | ||||
|       cn1=cn1/cnp; | ||||
|       cn1=cn1/cnp; | ||||
|       cnm=cnm/cnp; | ||||
|  | ||||
|       *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm); | ||||
|  | ||||
|       // Cycle pointers to avoid copies                                                                                      | ||||
|       Field *swizzle = Tnm; | ||||
|       Tnm    =Tn; | ||||
|       Tn     =Tnp; | ||||
|       Tnp    =swizzle; | ||||
|     } | ||||
|     out=*Tnp; | ||||
|  | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
| @@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| double MultiShiftFunction::approx(double x) | ||||
| { | ||||
|   double a = norm; | ||||
| @@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out) | ||||
|   } | ||||
|   return; | ||||
| } | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #ifndef MULTI_SHIFT_FUNCTION | ||||
| #define MULTI_SHIFT_FUNCTION | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| class MultiShiftFunction { | ||||
| public: | ||||
| @@ -63,5 +63,5 @@ public: | ||||
|   } | ||||
|  | ||||
| }; | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) { | ||||
| // Search for error maxima and minima | ||||
| void AlgRemez::search(bigfloat *step) { | ||||
|   bigfloat a, q, xm, ym, xn, yn, xx0, xx1; | ||||
|   int i, j, meq, emsign, ensign, steps; | ||||
|   int i, meq, emsign, ensign, steps; | ||||
|  | ||||
|   meq = neq + 1; | ||||
|   bigfloat *yy = new bigfloat[meq]; | ||||
| @@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) { | ||||
|   bigfloat eclose = 1.0e30; | ||||
|   bigfloat farther = 0l; | ||||
|  | ||||
|   j = 1; | ||||
|   xx0 = apstrt; | ||||
|  | ||||
|   for (i = 0; i < meq; i++) { | ||||
|   | ||||
							
								
								
									
										473
									
								
								Grid/algorithms/approx/RemezGeneral.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										473
									
								
								Grid/algorithms/approx/RemezGeneral.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,473 @@ | ||||
| #include<math.h> | ||||
| #include<stdio.h> | ||||
| #include<stdlib.h> | ||||
| #include<string> | ||||
| #include<iostream> | ||||
| #include<iomanip> | ||||
| #include<cassert> | ||||
|  | ||||
| #include<Grid/algorithms/approx/RemezGeneral.h> | ||||
|  | ||||
|  | ||||
| // Constructor | ||||
| AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision, | ||||
| 				 bigfloat (*f)(bigfloat x, void *data), void *data): f(f),  | ||||
| 										     data(data),  | ||||
| 										     prec(precision), | ||||
| 										     apstrt(lower), apend(upper), apwidt(upper - lower), | ||||
| 										     n(0), d(0), pow_n(0), pow_d(0) | ||||
| { | ||||
|   bigfloat::setDefaultPrecision(prec); | ||||
|  | ||||
|   std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n"; | ||||
|   std::cout<<"Precision of arithmetic is "<<precision<<std::endl; | ||||
| } | ||||
|  | ||||
| //Determine the properties of the numerator and denominator polynomials | ||||
| void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){ | ||||
|   pow_n = num_degree; | ||||
|   pow_d = den_degree; | ||||
|  | ||||
|   if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0); | ||||
|   if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0); | ||||
|  | ||||
|   if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0); | ||||
|   if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0); | ||||
|  | ||||
|   num_type = num_type_in; | ||||
|   den_type = den_type_in; | ||||
|  | ||||
|   num_pows.resize(pow_n+1); | ||||
|   den_pows.resize(pow_d+1); | ||||
|  | ||||
|   int n_in = 0; | ||||
|   bool odd = num_type == PolyType::Full || num_type == PolyType::Odd; | ||||
|   bool even = num_type == PolyType::Full || num_type == PolyType::Even; | ||||
|   for(int i=0;i<=pow_n;i++){ | ||||
|     num_pows[i] = -1; | ||||
|     if(i % 2 == 0 && even) num_pows[i] = n_in++; | ||||
|     if(i % 2 == 1 && odd) num_pows[i] = n_in++; | ||||
|   } | ||||
|  | ||||
|   std::cout << n_in << " terms in numerator" << std::endl; | ||||
|   --n_in; //power is 1 less than the number of terms, eg  pow=1   a x^1  + b x^0 | ||||
|  | ||||
|   int d_in = 0; | ||||
|   odd = den_type == PolyType::Full || den_type == PolyType::Odd; | ||||
|   even = den_type == PolyType::Full || den_type == PolyType::Even; | ||||
|   for(int i=0;i<=pow_d;i++){ | ||||
|     den_pows[i] = -1; | ||||
|     if(i % 2 == 0 && even) den_pows[i] = d_in++; | ||||
|     if(i % 2 == 1 && odd) den_pows[i] = d_in++; | ||||
|   } | ||||
|  | ||||
|   std::cout << d_in << " terms in denominator" << std::endl; | ||||
|   --d_in; | ||||
|  | ||||
|   n = n_in; | ||||
|   d = d_in; | ||||
| } | ||||
|  | ||||
| //Setup algorithm | ||||
| void AlgRemezGeneral::reinitializeAlgorithm(){ | ||||
|   spread = 1.0e37; | ||||
|   iter = 0; | ||||
|  | ||||
|   neq = n + d + 1; //not +2 because highest-power term in denominator is fixed to 1 | ||||
|  | ||||
|   param.resize(neq); | ||||
|   yy.resize(neq+1); | ||||
|  | ||||
|   //Initialize linear equation temporaries | ||||
|   A.resize(neq*neq); | ||||
|   B.resize(neq); | ||||
|   IPS.resize(neq); | ||||
|  | ||||
|   //Initialize maximum and minimum errors | ||||
|   xx.resize(neq+2); | ||||
|   mm.resize(neq+1); | ||||
|   initialGuess(); | ||||
|  | ||||
|   //Initialize search steps | ||||
|   step.resize(neq+1); | ||||
|   stpini(); | ||||
| } | ||||
|  | ||||
| double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree,  | ||||
| 				       const PolyType num_type_in, const PolyType den_type_in,  | ||||
| 				       const double _tolerance, const int report_freq){ | ||||
|   //Setup the properties of the polynomial | ||||
|   setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in); | ||||
|  | ||||
|   //Setup the algorithm | ||||
|   reinitializeAlgorithm(); | ||||
|  | ||||
|   bigfloat tolerance = _tolerance; | ||||
|  | ||||
|   //Iterate until convergance | ||||
|   while (spread > tolerance) {  | ||||
|     if (iter++ % report_freq==0) | ||||
|       std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl;  | ||||
|  | ||||
|     equations(); | ||||
|     if (delta < tolerance) { | ||||
|       std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n"; | ||||
|       assert(0); | ||||
|     };     | ||||
|     assert( delta>= tolerance ); | ||||
|  | ||||
|     search(); | ||||
|   } | ||||
|  | ||||
|   int sign; | ||||
|   double error = (double)getErr(mm[0],&sign); | ||||
|   std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl; | ||||
|  | ||||
|   // Return the maximum error in the approximation | ||||
|   return error; | ||||
| } | ||||
|  | ||||
|  | ||||
| // Initial values of maximal and minimal errors | ||||
| void AlgRemezGeneral::initialGuess(){ | ||||
|   // Supply initial guesses for solution points | ||||
|   long ncheb = neq;			// Degree of Chebyshev error estimate | ||||
|  | ||||
|   // Find ncheb+1 extrema of Chebyshev polynomial | ||||
|   bigfloat a = ncheb; | ||||
|   bigfloat r; | ||||
|  | ||||
|   mm[0] = apstrt; | ||||
|   for (long i = 1; i < ncheb; i++) { | ||||
|     r = 0.5 * (1 - cos((M_PI * i)/(double) a)); | ||||
|     //r *= sqrt_bf(r); | ||||
|     r = (exp((double)r)-1.0)/(exp(1.0)-1.0); | ||||
|     mm[i] = apstrt + r * apwidt; | ||||
|   } | ||||
|   mm[ncheb] = apend; | ||||
|  | ||||
|   a = 2.0 * ncheb; | ||||
|   for (long i = 0; i <= ncheb; i++) { | ||||
|     r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a)); | ||||
|     //r *= sqrt_bf(r); // Squeeze to low end of interval | ||||
|     r = (exp((double)r)-1.0)/(exp(1.0)-1.0); | ||||
|     xx[i] = apstrt + r * apwidt; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Initialise step sizes | ||||
| void AlgRemezGeneral::stpini(){ | ||||
|   xx[neq+1] = apend; | ||||
|   delta = 0.25; | ||||
|   step[0] = xx[0] - apstrt; | ||||
|   for (int i = 1; i < neq; i++) step[i] = xx[i] - xx[i-1]; | ||||
|   step[neq] = step[neq-1]; | ||||
| } | ||||
|  | ||||
| // Search for error maxima and minima | ||||
| void AlgRemezGeneral::search(){ | ||||
|   bigfloat a, q, xm, ym, xn, yn, xx1; | ||||
|   int emsign, ensign, steps; | ||||
|  | ||||
|   int meq = neq + 1; | ||||
|  | ||||
|   bigfloat eclose = 1.0e30; | ||||
|   bigfloat farther = 0l; | ||||
|  | ||||
|   bigfloat xx0 = apstrt; | ||||
|  | ||||
|   for (int i = 0; i < meq; i++) { | ||||
|     steps = 0; | ||||
|     xx1 = xx[i]; // Next zero | ||||
|     if (i == meq-1) xx1 = apend; | ||||
|     xm = mm[i]; | ||||
|     ym = getErr(xm,&emsign); | ||||
|     q = step[i]; | ||||
|     xn = xm + q; | ||||
|     if (xn < xx0 || xn >= xx1) {	// Cannot skip over adjacent boundaries | ||||
|       q = -q; | ||||
|       xn = xm; | ||||
|       yn = ym; | ||||
|       ensign = emsign; | ||||
|     } else { | ||||
|       yn = getErr(xn,&ensign); | ||||
|       if (yn < ym) { | ||||
| 	q = -q; | ||||
| 	xn = xm; | ||||
| 	yn = ym; | ||||
| 	ensign = emsign; | ||||
|       } | ||||
|     } | ||||
|    | ||||
|     while(yn >= ym) {		// March until error becomes smaller. | ||||
|       if (++steps > 10) | ||||
|       	break; | ||||
|        | ||||
|       ym = yn; | ||||
|       xm = xn; | ||||
|       emsign = ensign; | ||||
|       a = xm + q; | ||||
|       if (a == xm || a <= xx0 || a >= xx1) | ||||
| 	break;// Must not skip over the zeros either side.       | ||||
|  | ||||
|       xn = a; | ||||
|       yn = getErr(xn,&ensign); | ||||
|     } | ||||
|  | ||||
|     mm[i] = xm;			// Position of maximum | ||||
|     yy[i] = ym;			// Value of maximum | ||||
|  | ||||
|     if (eclose > ym) eclose = ym; | ||||
|     if (farther < ym) farther = ym; | ||||
|  | ||||
|     xx0 = xx1; // Walk to next zero. | ||||
|   } // end of search loop | ||||
|  | ||||
|   q = (farther - eclose);	// Decrease step size if error spread increased | ||||
|  | ||||
|   if (eclose != 0.0) q /= eclose; // Relative error spread | ||||
|  | ||||
|   if (q >= spread) | ||||
|     delta *= 0.5; // Spread is increasing; decrease step size | ||||
|    | ||||
|   spread = q; | ||||
|  | ||||
|   for (int i = 0; i < neq; i++) { | ||||
|     q = yy[i+1]; | ||||
|     if (q != 0.0) q = yy[i] / q  - (bigfloat)1l; | ||||
|     else q = 0.0625; | ||||
|     if (q > (bigfloat)0.25) q = 0.25; | ||||
|     q *= mm[i+1] - mm[i]; | ||||
|     step[i] = q * delta; | ||||
|   } | ||||
|   step[neq] = step[neq-1]; | ||||
|    | ||||
|   for (int i = 0; i < neq; i++) {	// Insert new locations for the zeros. | ||||
|     xm = xx[i] - step[i]; | ||||
|  | ||||
|     if (xm <= apstrt) | ||||
|       continue; | ||||
|  | ||||
|     if (xm >= apend) | ||||
|       continue; | ||||
|  | ||||
|     if (xm <= mm[i]) | ||||
|       xm = (bigfloat)0.5 * (mm[i] + xx[i]);     | ||||
|  | ||||
|     if (xm >= mm[i+1]) | ||||
|       xm = (bigfloat)0.5 * (mm[i+1] + xx[i]); | ||||
|      | ||||
|     xx[i] = xm; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Solve the equations | ||||
| void AlgRemezGeneral::equations(){ | ||||
|   bigfloat x, y, z; | ||||
|   bigfloat *aa; | ||||
|    | ||||
|   for (int i = 0; i < neq; i++) {	// set up the equations for solution by simq() | ||||
|     int ip = neq * i;		// offset to 1st element of this row of matrix | ||||
|     x = xx[i];			// the guess for this row | ||||
|     y = func(x);		// right-hand-side vector | ||||
|  | ||||
|     z = (bigfloat)1l; | ||||
|     aa = A.data()+ip; | ||||
|     int t = 0; | ||||
|     for (int j = 0; j <= pow_n; j++) { | ||||
|       if(num_pows[j] != -1){ *aa++ = z; t++; } | ||||
|       z *= x; | ||||
|     } | ||||
|     assert(t == n+1); | ||||
|  | ||||
|     z = (bigfloat)1l; | ||||
|     t = 0; | ||||
|     for (int j = 0; j < pow_d; j++) { | ||||
|       if(den_pows[j] != -1){ *aa++ = -y * z; t++; } | ||||
|       z *= x; | ||||
|     } | ||||
|     assert(t == d); | ||||
|  | ||||
|     B[i] = y * z;		// Right hand side vector | ||||
|   } | ||||
|  | ||||
|   // Solve the simultaneous linear equations. | ||||
|   if (simq()){ | ||||
|     std::cout<<"simq failed\n"; | ||||
|     exit(0); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| // Evaluate the rational form P(x)/Q(x) using coefficients | ||||
| // from the solution vector param | ||||
| bigfloat AlgRemezGeneral::approx(const bigfloat x) const{ | ||||
|   // Work backwards toward the constant term. | ||||
|   int c = n; | ||||
|   bigfloat yn = param[c--];		// Highest order numerator coefficient | ||||
|   for (int i = pow_n-1; i >= 0; i--) yn = x * yn  +  (num_pows[i] != -1 ? param[c--] : bigfloat(0l));   | ||||
|  | ||||
|   c = n+d; | ||||
|   bigfloat yd = 1l; //Highest degree coefficient is 1.0 | ||||
|   for (int i = pow_d-1; i >= 0; i--) yd = x * yd  +  (den_pows[i] != -1 ? param[c--] : bigfloat(0l));  | ||||
|  | ||||
|   return(yn/yd); | ||||
| } | ||||
|  | ||||
| // Compute size and sign of the approximation error at x | ||||
| bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{ | ||||
|   bigfloat f = func(x); | ||||
|   bigfloat e = approx(x) - f; | ||||
|   if (f != 0) e /= f; | ||||
|   if (e < (bigfloat)0.0) { | ||||
|     *sign = -1; | ||||
|     e = -e; | ||||
|   } | ||||
|   else *sign = 1; | ||||
|    | ||||
|   return(e); | ||||
| } | ||||
|  | ||||
| // Solve the system AX=B | ||||
| int AlgRemezGeneral::simq(){ | ||||
|  | ||||
|   int ip, ipj, ipk, ipn; | ||||
|   int idxpiv; | ||||
|   int kp, kp1, kpk, kpn; | ||||
|   int nip, nkp; | ||||
|   bigfloat em, q, rownrm, big, size, pivot, sum; | ||||
|   bigfloat *aa; | ||||
|   bigfloat *X = param.data(); | ||||
|  | ||||
|   int n = neq; | ||||
|   int nm1 = n - 1; | ||||
|   // Initialize IPS and X | ||||
|    | ||||
|   int ij = 0; | ||||
|   for (int i = 0; i < n; i++) { | ||||
|     IPS[i] = i; | ||||
|     rownrm = 0.0; | ||||
|     for(int j = 0; j < n; j++) { | ||||
|       q = abs_bf(A[ij]); | ||||
|       if(rownrm < q) rownrm = q; | ||||
|       ++ij; | ||||
|     } | ||||
|     if (rownrm == (bigfloat)0l) { | ||||
|       std::cout<<"simq rownrm=0\n"; | ||||
|       return(1); | ||||
|     } | ||||
|     X[i] = (bigfloat)1.0 / rownrm; | ||||
|   } | ||||
|    | ||||
|   for (int k = 0; k < nm1; k++) { | ||||
|     big = 0.0; | ||||
|     idxpiv = 0; | ||||
|      | ||||
|     for (int i = k; i < n; i++) { | ||||
|       ip = IPS[i]; | ||||
|       ipk = n*ip + k; | ||||
|       size = abs_bf(A[ipk]) * X[ip]; | ||||
|       if (size > big) { | ||||
| 	big = size; | ||||
| 	idxpiv = i; | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     if (big == (bigfloat)0l) { | ||||
|       std::cout<<"simq big=0\n"; | ||||
|       return(2); | ||||
|     } | ||||
|     if (idxpiv != k) { | ||||
|       int j = IPS[k]; | ||||
|       IPS[k] = IPS[idxpiv]; | ||||
|       IPS[idxpiv] = j; | ||||
|     } | ||||
|     kp = IPS[k]; | ||||
|     kpk = n*kp + k; | ||||
|     pivot = A[kpk]; | ||||
|     kp1 = k+1; | ||||
|     for (int i = kp1; i < n; i++) { | ||||
|       ip = IPS[i]; | ||||
|       ipk = n*ip + k; | ||||
|       em = -A[ipk] / pivot; | ||||
|       A[ipk] = -em; | ||||
|       nip = n*ip; | ||||
|       nkp = n*kp; | ||||
|       aa = A.data()+nkp+kp1; | ||||
|       for (int j = kp1; j < n; j++) { | ||||
| 	ipj = nip + j; | ||||
| 	A[ipj] = A[ipj] + em * *aa++; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   kpn = n * IPS[n-1] + n - 1;	// last element of IPS[n] th row | ||||
|   if (A[kpn] == (bigfloat)0l) { | ||||
|     std::cout<<"simq A[kpn]=0\n"; | ||||
|     return(3); | ||||
|   } | ||||
|  | ||||
|    | ||||
|   ip = IPS[0]; | ||||
|   X[0] = B[ip]; | ||||
|   for (int i = 1; i < n; i++) { | ||||
|     ip = IPS[i]; | ||||
|     ipj = n * ip; | ||||
|     sum = 0.0; | ||||
|     for (int j = 0; j < i; j++) { | ||||
|       sum += A[ipj] * X[j]; | ||||
|       ++ipj; | ||||
|     } | ||||
|     X[i] = B[ip] - sum; | ||||
|   } | ||||
|    | ||||
|   ipn = n * IPS[n-1] + n - 1; | ||||
|   X[n-1] = X[n-1] / A[ipn]; | ||||
|    | ||||
|   for (int iback = 1; iback < n; iback++) { | ||||
|     //i goes (n-1),...,1 | ||||
|     int i = nm1 - iback; | ||||
|     ip = IPS[i]; | ||||
|     nip = n*ip; | ||||
|     sum = 0.0; | ||||
|     aa = A.data()+nip+i+1; | ||||
|     for (int j= i + 1; j < n; j++)  | ||||
|       sum += *aa++ * X[j]; | ||||
|     X[i] = (X[i] - sum) / A[nip+i]; | ||||
|   } | ||||
|    | ||||
|   return(0); | ||||
| } | ||||
|  | ||||
| void AlgRemezGeneral::csv(std::ostream & os) const{ | ||||
|   os << "Numerator" << std::endl; | ||||
|   for(int i=0;i<=pow_n;i++){ | ||||
|     os << getCoeffNum(i) << "*x^" << i; | ||||
|     if(i!=pow_n) os << " + "; | ||||
|   } | ||||
|   os << std::endl; | ||||
|  | ||||
|   os << "Denominator" << std::endl; | ||||
|   for(int i=0;i<=pow_d;i++){ | ||||
|     os << getCoeffDen(i) << "*x^" << i; | ||||
|     if(i!=pow_d) os << " + "; | ||||
|   } | ||||
|   os << std::endl; | ||||
|  | ||||
|   //For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc | ||||
|   int sign; | ||||
|   os << "Errors at maxima: coordinate, error, (sign)" << std::endl; | ||||
|   for(int i=0;i<neq+1;i++){  | ||||
|     os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl; | ||||
|   } | ||||
|  | ||||
|   os << "Scan over range:" << std::endl; | ||||
|   int npt = 60; | ||||
|   bigfloat dlt = (apend - apstrt)/bigfloat(npt-1); | ||||
|  | ||||
|   for (bigfloat x=apstrt; x<=apend; x = x + dlt) { | ||||
|     double f = evaluateFunc(x); | ||||
|     double r = evaluateApprox(x); | ||||
|     os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl; | ||||
|   } | ||||
|   return; | ||||
| } | ||||
							
								
								
									
										170
									
								
								Grid/algorithms/approx/RemezGeneral.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										170
									
								
								Grid/algorithms/approx/RemezGeneral.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,170 @@ | ||||
| /* | ||||
|   C.Kelly Jan 2020 based on implementation by M. Clark May 2005 | ||||
|  | ||||
|   AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial  | ||||
|   It includes optional restriction to odd/even polynomials for the numerator and/or denominator | ||||
| */ | ||||
|  | ||||
| #ifndef INCLUDED_ALG_REMEZ_GENERAL_H | ||||
| #define INCLUDED_ALG_REMEZ_GENERAL_H | ||||
|  | ||||
| #include <stddef.h> | ||||
| #include <Grid/GridStd.h> | ||||
|  | ||||
| #ifdef HAVE_LIBGMP | ||||
| #include "bigfloat.h" | ||||
| #else | ||||
| #include "bigfloat_double.h" | ||||
| #endif | ||||
|  | ||||
|  | ||||
| // Remez-algorithm driver that approximates a user-supplied function by a rational polynomial P(x)/Q(x) | ||||
| // over the interval [apstrt, apend]. The numerator and/or denominator may optionally be restricted to | ||||
| // even or odd polynomials (see PolyType). | ||||
| class AlgRemezGeneral{ | ||||
|  public: | ||||
|   // Parity restriction that can be applied to the numerator or denominator polynomial | ||||
|   enum PolyType { Even, Odd, Full }; | ||||
|  | ||||
|  private: | ||||
|  | ||||
|   // In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer | ||||
|   bigfloat (*f)(bigfloat x, void *data); | ||||
|   void *data; | ||||
|  | ||||
|   // The approximation parameters | ||||
|   std::vector<bigfloat> param; | ||||
|   bigfloat norm; | ||||
|  | ||||
|   // The number of non-zero terms in the numerator and denominator | ||||
|   int n, d; | ||||
|   // The numerator and denominator degree (i.e.  the largest power) | ||||
|   int pow_n, pow_d; | ||||
|  | ||||
|   // Specify if the numerator and/or denominator are odd/even polynomials | ||||
|   PolyType num_type; | ||||
|   PolyType den_type; | ||||
|   // Map from power of x to index of the corresponding coefficient, with -1 if that power is not present | ||||
|   std::vector<int> num_pows; | ||||
|   std::vector<int> den_pows; | ||||
|  | ||||
|   // The bounds of the approximation | ||||
|   bigfloat apstrt, apwidt, apend; | ||||
|  | ||||
|   // Variables used to calculate the approximation | ||||
|   int nd1, iter; | ||||
|   std::vector<bigfloat> xx; | ||||
|   std::vector<bigfloat> mm; | ||||
|   std::vector<bigfloat> step; | ||||
|  | ||||
|   bigfloat delta, spread; | ||||
|  | ||||
|   // Variables used in search | ||||
|   std::vector<bigfloat> yy; | ||||
|  | ||||
|   // Variables used in solving linear equations | ||||
|   std::vector<bigfloat> A; | ||||
|   std::vector<bigfloat> B; | ||||
|   std::vector<int> IPS; | ||||
|  | ||||
|   // The number of equations we must solve at each iteration (n+d+1) | ||||
|   int neq; | ||||
|  | ||||
|   // The precision of the GNU MP library | ||||
|   long prec; | ||||
|  | ||||
|   // Initialize member variables associated with the polynomial's properties | ||||
|   void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in); | ||||
|  | ||||
|   // Initial values of maximal and minimal errors | ||||
|   void initialGuess(); | ||||
|  | ||||
|   // Initialise step sizes | ||||
|   void stpini(); | ||||
|  | ||||
|   // Initialize the algorithm | ||||
|   void reinitializeAlgorithm(); | ||||
|  | ||||
|   // Solve the equations | ||||
|   void equations(); | ||||
|  | ||||
|   // Search for error maxima and minima | ||||
|   void search(); | ||||
|  | ||||
|   // Evaluate the target function at x by forwarding to the user-supplied callback and its data pointer | ||||
|   inline bigfloat func(bigfloat x) const{ | ||||
|     return f(x, data); | ||||
|   } | ||||
|  | ||||
|   // Compute size and sign of the approximation error at x | ||||
|   bigfloat getErr(bigfloat x, int *sign) const; | ||||
|  | ||||
|   // Solve the system AX=B   where X = param | ||||
|   int simq(); | ||||
|  | ||||
|   // Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param | ||||
|   bigfloat approx(bigfloat x) const; | ||||
|  | ||||
|  public: | ||||
|  | ||||
|   // lower/upper : bounds of the approximation interval | ||||
|   // prec        : precision of the GNU MP library | ||||
|   // f, data     : function to approximate and the opaque pointer handed back to it on every call | ||||
|   AlgRemezGeneral(double lower, double upper, long prec, | ||||
|                   bigfloat (*f)(bigfloat x, void *data), void *data); | ||||
|  | ||||
|   // Number of non-zero terms; only meaningful (and asserted) when numerator and denominator have the same count | ||||
|   inline int getDegree(void) const{ | ||||
|     assert(n==d); | ||||
|     return n; | ||||
|   } | ||||
|   // Reset the bounds of the approximation | ||||
|   inline void setBounds(double lower, double upper) { | ||||
|     apstrt = lower; | ||||
|     apend = upper; | ||||
|     apwidt = apend - apstrt; | ||||
|   } | ||||
|  | ||||
|   // Get the bounds of the approximation | ||||
|   inline void getBounds(double &lower, double &upper) const{ | ||||
|     lower=(double)apstrt; | ||||
|     upper=(double)apend; | ||||
|   } | ||||
|  | ||||
|   // Run the algorithm to generate the rational approximation | ||||
|   double generateApprox(int num_degree, int den_degree, | ||||
|                         PolyType num_type, PolyType den_type, | ||||
|                         const double tolerance = 1e-15, const int report_freq = 1000); | ||||
|  | ||||
|   // As above with no parity restriction on either polynomial | ||||
|   inline double generateApprox(int num_degree, int den_degree, | ||||
|                                const double tolerance = 1e-15, const int report_freq = 1000){ | ||||
|     return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq); | ||||
|   } | ||||
|  | ||||
|   // Evaluate the rational form P(x)/Q(x) using coefficients from the | ||||
|   // solution vector param | ||||
|   inline double evaluateApprox(double x) const{ | ||||
|     return (double)approx((bigfloat)x); | ||||
|   } | ||||
|  | ||||
|   // Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param | ||||
|   inline double evaluateInverseApprox(double x) const{ | ||||
|     return 1.0/(double)approx((bigfloat)x); | ||||
|   } | ||||
|  | ||||
|   // Calculate function required for the approximation | ||||
|   inline double evaluateFunc(double x) const{ | ||||
|     return (double)func((bigfloat)x); | ||||
|   } | ||||
|  | ||||
|   // Calculate inverse function required for the approximation | ||||
|   inline double evaluateInverseFunc(double x) const{ | ||||
|     return 1.0/(double)func((bigfloat)x); | ||||
|   } | ||||
|  | ||||
|   // Dump csv of function, approx and error | ||||
|   void csv(std::ostream &os = std::cout) const; | ||||
|  | ||||
|   // Get the coefficient of the term x^i in the numerator (0 if that power is not present) | ||||
|   inline double getCoeffNum(const int i) const{ | ||||
|     // i indexes the power->coefficient map directly, so trap out-of-range powers rather than read past the end | ||||
|     assert(i >= 0 && i < (int)num_pows.size()); | ||||
|     return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]); | ||||
|   } | ||||
|   // Get the coefficient of the term x^i in the denominator (0 if that power is not present) | ||||
|   inline double getCoeffDen(const int i) const{ | ||||
|     // The largest power of the denominator has unit coefficient and is not stored in param | ||||
|     if(i == pow_d) return 1.0; | ||||
|     assert(i >= 0 && i < (int)den_pows.size()); | ||||
|     // Denominator coefficients are read from param at an offset of n+1 | ||||
|     return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #endif | ||||
							
								
								
									
										183
									
								
								Grid/algorithms/approx/ZMobius.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										183
									
								
								Grid/algorithms/approx/ZMobius.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,183 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/approx/ZMobius.cc | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Christopher Kelly <ckelly@phys.columbia.edu> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #include <Grid/algorithms/approx/ZMobius.h> | ||||
| #include <Grid/algorithms/approx/RemezGeneral.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| NAMESPACE_BEGIN(Approx); | ||||
|  | ||||
| //Evaluate the tanh approximation  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  with  f(x) = \Prod_i ( w_i + x ) | ||||
| //for complex w_i. Only the real part of the ratio is returned. | ||||
| inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){ | ||||
|   ComplexD prod_plus = 1., prod_minus = 1.; | ||||
|  | ||||
|   for(const ComplexD &w_s : w){ | ||||
|     prod_plus  = prod_plus  * ( w_s + x ); | ||||
|     prod_minus = prod_minus * ( w_s - x ); | ||||
|   } | ||||
|  | ||||
|   const ComplexD ratio = (prod_plus - prod_minus)/(prod_plus + prod_minus); | ||||
|   return ratio.real(); | ||||
| } | ||||
| //Real-omega version of the tanh approximation  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  with  f(x) = \Prod_i ( w_i + x ) | ||||
| inline double epsilonMobius(const double x, const std::vector<RealD> &w){ | ||||
|   double prod_plus = 1., prod_minus = 1.; | ||||
|  | ||||
|   for(const RealD &w_s : w){ | ||||
|     prod_plus  = prod_plus  * ( w_s + x ); | ||||
|     prod_minus = prod_minus * ( w_s - x ); | ||||
|   } | ||||
|  | ||||
|   const double numerator   = prod_plus - prod_minus; | ||||
|   const double denominator = prod_plus + prod_minus; | ||||
|   return numerator/denominator; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| //Compute the tanh approximation in a form suitable for the Remez | ||||
| //  x    : point at which to evaluate | ||||
| //  data : must point to a std::vector<RealD> holding the omega_i (passed GSL-style as an opaque pointer) | ||||
| //Returns [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  with  f(x) = \Prod_i ( omega_i + x ) | ||||
| bigfloat epsilonMobius(bigfloat x, void* data){ | ||||
|   const std::vector<RealD> &omega = *static_cast<const std::vector<RealD>*>(data); | ||||
|   bigfloat fxp(1.0); | ||||
|   bigfloat fmp(1.0); | ||||
|  | ||||
|   //Range-for avoids the signed/unsigned index comparison; each omega is promoted to bigfloat once | ||||
|   for(const RealD omega_i : omega){ | ||||
|     bigfloat w(omega_i); | ||||
|     fxp = fxp * ( w + x ); | ||||
|     fmp = fmp * ( w - x ); | ||||
|   } | ||||
|   return (fxp - fmp)/(fxp + fmp); | ||||
| } | ||||
|  | ||||
| //Compute the Zmobius Omega parameters suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound | ||||
| //Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters | ||||
| //  omega_out    : resized to Ls_out and filled with the resulting complex omega | ||||
| //  omega_in     : the Ls_in real omega defining the function that is approximated | ||||
| //  lambda_bound : upper end of the interval [0, lambda_bound] handed to the Remez | ||||
| //Diagnostic tables are written to std::cout as a side effect. | ||||
| //NOTE(review): the Odd numerator of degree Ls_out-1 / Even denominator of degree Ls_out implies Ls_out is even - confirm generateApprox enforces this | ||||
| void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, | ||||
|                          const std::vector<RealD> &omega_in, const int Ls_in, | ||||
|                          const RealD lambda_bound){ | ||||
|   assert(Ls_in >= 0 && omega_in.size() == static_cast<std::size_t>(Ls_in)); | ||||
|   omega_out.resize(Ls_out); | ||||
|  | ||||
|   //Use the Remez algorithm to generate the appropriate rational polynomial | ||||
|   //For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6) | ||||
|   AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in); | ||||
|   remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100); | ||||
|   remez.csv(std::cout); | ||||
|  | ||||
|   //The rational approximation has the form  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  where  f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x ) | ||||
|   //cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7  pg 102 | ||||
|   //omega_i are therefore the negative of the complex roots of f(x) | ||||
|  | ||||
|   //We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial | ||||
|   // \rho(\lambda) = det( A - \lambda I )    where I is the unit matrix | ||||
|   //The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... + x^n   is the companion matrix | ||||
|   // A = | 0    1   0    0 0 .... 0      | | ||||
|   //     | 0    0   1    0 0 .... 0      | | ||||
|   //     | :    :   :    : :      :      | | ||||
|   //     | 0    0   0    0 0      1      | | ||||
|   //     | -a0 -a1 -a2  ...  ... -a(n-1) | | ||||
|  | ||||
|  | ||||
|   //Note the Remez defines the largest power to have unit coefficient | ||||
|   //Even powers of f(x) come from the denominator, odd powers from the numerator | ||||
|   std::vector<RealD> coeffs(Ls_out+1); | ||||
|   for(int i=0;i<Ls_out+1;i++) | ||||
|     coeffs[i] = (i % 2 == 0) ? remez.getCoeffDen(i) : remez.getCoeffNum(i); | ||||
|  | ||||
|   //Form the companion matrix: ones on the superdiagonal, negated coefficients along the last row | ||||
|   Eigen::MatrixXd compn = Eigen::MatrixXd::Zero(Ls_out,Ls_out); | ||||
|   for(int j=1;j<Ls_out;j++) compn(j-1,j) = 1.; | ||||
|   for(int j=0;j<Ls_out;j++) compn(Ls_out - 1, j) = -coeffs[j]; | ||||
|  | ||||
|   //Eigensolve (eigenvalues only) | ||||
|   Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false); | ||||
|  | ||||
|   const auto & ev = slv.eigenvalues(); | ||||
|   for(int i=0;i<Ls_out;i++) | ||||
|     omega_out[i] = -ev(i); | ||||
|  | ||||
|   //Sort ascending (smallest at start of vector!), ordering by real part then imaginary part | ||||
|   std::sort(omega_out.begin(), omega_out.end(), | ||||
|             [](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); }); | ||||
|  | ||||
|   //McGlynn thesis pg 122 suggest improved iteration counts if magnitude of omega diminishes towards the center of the 5th dimension | ||||
|   //Walk the sorted list from the back and place entries alternately at the low and high ends of the output | ||||
|   std::vector<ComplexD> omega_tmp = omega_out; | ||||
|   int s_low=0, s_high=Ls_out-1, ss=0; | ||||
|   for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega | ||||
|     int s_to; | ||||
|     if(ss % 2 == 0){ | ||||
|       s_to = s_low++; | ||||
|     }else{ | ||||
|       s_to = s_high--; | ||||
|     } | ||||
|     omega_out[s_to] = omega_tmp[s_from]; | ||||
|     ++ss; | ||||
|   } | ||||
|  | ||||
|   std::cout << "Resulting omega_i:" << std::endl; | ||||
|   for(int i=0;i<Ls_out;i++) | ||||
|     std::cout << omega_out[i] << std::endl; | ||||
|  | ||||
|   std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl; | ||||
|   std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n"; | ||||
|  | ||||
|   //Tabulate the Remez fit, the reconstruction from omega_out and the original function on npt points in [0, lambda_bound] | ||||
|   int npt = 60; | ||||
|   double dlt = lambda_bound/double(npt-1); | ||||
|  | ||||
|   for (int i =0; i<npt; i++){ | ||||
|     double x = i*dlt; | ||||
|     double r = remez.evaluateApprox(x); | ||||
|     double p = epsilonMobius(x, omega_out); | ||||
|     double e = epsilonMobius(x, omega_in); | ||||
|  | ||||
|     std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl; | ||||
|   } | ||||
|  | ||||
| } | ||||
|    | ||||
| //Uniform-Mobius input: every one of the Ls_in input omega equals 1/mobius_param,  where mobius_param = b+c   with b-c=1 | ||||
| void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){ | ||||
|   const RealD omega_uniform = 1./mobius_param; | ||||
|   std::vector<RealD> omega_in(Ls_in, omega_uniform); | ||||
|  | ||||
|   //Delegate to the general version taking an explicit vector of input omega | ||||
|   computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound); | ||||
| } | ||||
|  | ||||
| //The ZMobius class takes  gamma_i = (b+c) omega_i  as its input, with b, c factored out. | ||||
| //Here the omega are generated from mobius_param_in and then scaled by mobius_param_out. | ||||
| void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, | ||||
|                          const RealD mobius_param_out, const int Ls_out, | ||||
|                          const RealD mobius_param_in, const int Ls_in, | ||||
|                          const RealD lambda_bound){ | ||||
|   //gamma_out first receives the omega_i ... | ||||
|   computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound); | ||||
|  | ||||
|   //... which are then rescaled in place | ||||
|   for(int s=0; s<Ls_out; ++s){ | ||||
|     gamma_out[s] = gamma_out[s] * mobius_param_out; | ||||
|   } | ||||
| } | ||||
| //Shorthand for the case  mobius_param_out == mobius_param_in : the same value is used for both | ||||
| void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){ | ||||
|   const RealD &param_out = mobius_param; | ||||
|   const RealD &param_in  = mobius_param; | ||||
|   computeZmobiusGamma(gamma_out, param_out, Ls_out, param_in, Ls_in, lambda_bound); | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Approx); | ||||
| NAMESPACE_END(Grid); | ||||
							
								
								
									
										57
									
								
								Grid/algorithms/approx/ZMobius.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								Grid/algorithms/approx/ZMobius.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,57 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/approx/ZMobius.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Christopher Kelly <ckelly@phys.columbia.edu> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_ZMOBIUS_APPROX_H | ||||
| #define GRID_ZMOBIUS_APPROX_H | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| NAMESPACE_BEGIN(Approx); | ||||
|  | ||||
| //Compute the Ls_out complex Zmobius Omega parameters (written to omega_out) suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound, starting from the Ls_in real parameters omega_in | ||||
| //Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters | ||||
| void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, | ||||
| 			 const std::vector<RealD> &omega_in, const int Ls_in, | ||||
| 			 const RealD lambda_bound); | ||||
|    | ||||
| //As above but with omega_in taken as Ls_in copies of 1/mobius_param,  where mobius_param = b+c   with b-c=1 | ||||
| void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound); | ||||
|  | ||||
| //ZMobius class takes  gamma_i = (b+c) omega_i as its input, where b, c are factored out; gamma_out holds the omega computed from mobius_param_in, each multiplied by mobius_param_out | ||||
| void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,  | ||||
| 			 const RealD mobius_param_out, const int Ls_out,  | ||||
| 			 const RealD mobius_param_in, const int Ls_in, | ||||
| 			 const RealD lambda_bound); | ||||
|  | ||||
| //As above but assumes mobius_param_out == mobius_param_in == mobius_param | ||||
| void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound); | ||||
|  | ||||
| NAMESPACE_END(Approx); | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
| @@ -58,8 +58,8 @@ | ||||
|  | ||||
| /* Compute the partial fraction expansion coefficients (alpha) from the | ||||
|  * factored form */ | ||||
| namespace Grid { | ||||
| namespace Approx { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| NAMESPACE_BEGIN(Approx); | ||||
|  | ||||
| static void construct_partfrac(izd *z) { | ||||
|   int dn = z -> dn, dd = z -> dd, type = z -> type; | ||||
| @@ -516,7 +516,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) { | ||||
|   free(d); | ||||
|   return zd; | ||||
| } | ||||
| }} | ||||
|  | ||||
| NAMESPACE_END(Approx); | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #ifdef TEST | ||||
|  | ||||
| @@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { | ||||
|   return (ONE - T) / (ONE + T); | ||||
| } | ||||
|  | ||||
|  | ||||
| /* Test program. Apart from printing out the parameters for R(x) it produces | ||||
|  * the following data files for plotting (unless NPLOT is defined): | ||||
|  * | ||||
| @@ -723,5 +726,5 @@ int main(int argc, char** argv) { | ||||
|   return EXIT_SUCCESS; | ||||
| } | ||||
|  | ||||
|  | ||||
| #endif /* TEST */ | ||||
|  | ||||
|   | ||||
| @@ -1,13 +1,13 @@ | ||||
| /* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */ | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| namespace Grid { | ||||
| namespace Approx { | ||||
| #include <Grid/Namespace.h> | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| NAMESPACE_BEGIN(Approx); | ||||
| #endif | ||||
|  | ||||
| #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> | ||||
|  | ||||
|  | ||||
| #ifndef ZOLOTAREV_INTERNAL | ||||
| #ifndef PRECISION | ||||
| #define PRECISION double | ||||
| @@ -83,5 +83,6 @@ void zolotarev_free(zolotarev_data *zdata); | ||||
| #endif | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| }} | ||||
| NAMESPACE_END(Approx); | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -10,10 +10,12 @@ | ||||
| #ifndef INCLUDED_BIGFLOAT_H | ||||
| #define INCLUDED_BIGFLOAT_H | ||||
|  | ||||
|  | ||||
| #define __GMP_WITHIN_CONFIGURE | ||||
| #include <gmp.h> | ||||
| #include <mpf2mpfr.h> | ||||
| #include <mpfr.h> | ||||
| #undef  __GMP_WITHIN_CONFIGURE | ||||
|  | ||||
| class bigfloat { | ||||
|  | ||||
| private: | ||||
|   | ||||
| @@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
|  | ||||
| #ifndef INCLUDED_BIGFLOAT_DOUBLE_H | ||||
| #define INCLUDED_BIGFLOAT_DOUBLE_H | ||||
|  | ||||
| #include <math.h> | ||||
|  | ||||
| typedef double mfloat;  | ||||
| @@ -186,4 +190,6 @@ public: | ||||
|   //  friend bigfloat& random(void); | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     grid             = src._grid; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     grid             = src.Grid(); | ||||
|  | ||||
|     RealD f; | ||||
|     RealD rtzp,rtz,a,d,b; | ||||
|   | ||||
							
								
								
									
										234
									
								
								Grid/algorithms/iterative/BiCGSTAB.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										234
									
								
								Grid/algorithms/iterative/BiCGSTAB.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,234 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/algorithms/iterative/BiCGSTAB.h | ||||
|  | ||||
| Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: juettner <juettner@soton.ac.uk> | ||||
| Author: David Murphy <djmurphy@mit.edu> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef GRID_BICGSTAB_H | ||||
| #define GRID_BICGSTAB_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Base classes for iterative processes based on operators | ||||
| // single input vec, single output vec. | ||||
| ///////////////////////////////////////////////////////////// | ||||
|  | ||||
| // BiCGSTAB solver: iteratively updates psi so that Linop.Op(psi) approaches src. | ||||
| // The iteration stops once the squared residual norm drops to Tolerance^2 * |src|^2 | ||||
| // or after MaxIterations iterations. | ||||
| template <class Field> | ||||
| class BiCGSTAB : public OperatorFunction<Field>  | ||||
| { | ||||
|   public: | ||||
|     using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|     bool ErrorOnNoConverge;  // assert when the solver fails to converge. | ||||
|                              // Defaults true. | ||||
|     RealD Tolerance;         // target for sqrt(|r|^2 / |src|^2) | ||||
|     Integer MaxIterations; | ||||
|     Integer IterationsToComplete; //Number of iterations the solver took to finish. Filled in upon completion | ||||
|  | ||||
|     // Members are initialised in declaration order; IterationsToComplete starts at 0 so it is never read uninitialised | ||||
|     BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) :  | ||||
|       ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0) {} | ||||
|  | ||||
|     // Solve using Linop with right-hand side src; psi supplies the initial guess and receives the solution. | ||||
|     void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi)  | ||||
|     { | ||||
|       psi.Checkerboard() = src.Checkerboard(); | ||||
|       conformable(psi, src); | ||||
|  | ||||
|       RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1); | ||||
|       RealD a(0), bo(0), b(0), ssq(0); | ||||
|  | ||||
|       // Work vectors, constructed as copies of src | ||||
|       Field p(src); | ||||
|       Field r(src); | ||||
|       Field rhat(src); | ||||
|       Field v(src); | ||||
|       Field s(src); | ||||
|       Field t(src); | ||||
|       Field h(src); | ||||
|  | ||||
|       v = Zero(); | ||||
|       p = Zero(); | ||||
|  | ||||
|       // Initial residual computation & set up | ||||
|       RealD guess = norm2(psi); | ||||
|       assert(std::isnan(guess) == 0); | ||||
|  | ||||
|       Linop.Op(psi, v); | ||||
|       b = norm2(v); | ||||
|  | ||||
|       r = src - v; | ||||
|       rhat = r;          // fixed shadow residual used in the inner products below | ||||
|       a = norm2(r); | ||||
|       ssq = norm2(src); | ||||
|  | ||||
|       std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:   src " << ssq << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:    mp " << b << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:     r " << a << std::endl; | ||||
|  | ||||
|       // Target for the squared residual norm | ||||
|       RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|       // Check if guess is really REALLY good :) | ||||
|       // No iterations were needed, so record that before returning | ||||
|       if(a <= rsq){ IterationsToComplete = 0; return; } | ||||
|  | ||||
|       std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl; | ||||
|  | ||||
|       GridStopWatch LinalgTimer; | ||||
|       GridStopWatch InnerTimer; | ||||
|       GridStopWatch AxpyNormTimer; | ||||
|       GridStopWatch LinearCombTimer; | ||||
|       GridStopWatch MatrixTimer; | ||||
|       GridStopWatch SolverTimer; | ||||
|  | ||||
|       SolverTimer.Start(); | ||||
|       int k; | ||||
|       for (k = 1; k <= MaxIterations; k++)  | ||||
|       { | ||||
|         rho_prev = rho; | ||||
|  | ||||
|         LinalgTimer.Start(); | ||||
|         InnerTimer.Start(); | ||||
|         ComplexD Crho  = innerProduct(rhat,r); | ||||
|         InnerTimer.Stop(); | ||||
|         rho = Crho.real(); | ||||
|  | ||||
|         beta = (rho / rho_prev) * (alpha / omega); | ||||
|  | ||||
|         // p = r + beta * (p - omega * v) | ||||
|         LinearCombTimer.Start(); | ||||
|         bo = beta * omega; | ||||
|         { | ||||
|           autoView( p_v , p, AcceleratorWrite); | ||||
|           autoView( r_v , r, AcceleratorRead); | ||||
|           autoView( v_v , v, AcceleratorRead); | ||||
|           accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ | ||||
|               coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); | ||||
|             }); | ||||
|         } | ||||
|         LinearCombTimer.Stop(); | ||||
|         LinalgTimer.Stop(); | ||||
|  | ||||
|         MatrixTimer.Start(); | ||||
|         Linop.Op(p,v); | ||||
|         MatrixTimer.Stop(); | ||||
|  | ||||
|         LinalgTimer.Start(); | ||||
|         InnerTimer.Start(); | ||||
|         ComplexD Calpha = innerProduct(rhat,v); | ||||
|         InnerTimer.Stop(); | ||||
|         alpha = rho / Calpha.real(); | ||||
|  | ||||
|         // h = psi + alpha * p   and   s = r - alpha * v | ||||
|         LinearCombTimer.Start(); | ||||
|         { | ||||
|           autoView( p_v , p, AcceleratorRead); | ||||
|           autoView( r_v , r, AcceleratorRead); | ||||
|           autoView( v_v , v, AcceleratorRead); | ||||
|           autoView( psi_v,psi, AcceleratorRead); | ||||
|           autoView( h_v  ,  h, AcceleratorWrite); | ||||
|           autoView( s_v  ,  s, AcceleratorWrite); | ||||
|           accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ | ||||
|               coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); | ||||
|             }); | ||||
|           accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ | ||||
|               coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); | ||||
|             }); | ||||
|         } | ||||
|         LinearCombTimer.Stop(); | ||||
|         LinalgTimer.Stop(); | ||||
|  | ||||
|         MatrixTimer.Start(); | ||||
|         Linop.Op(s,t); | ||||
|         MatrixTimer.Stop(); | ||||
|  | ||||
|         LinalgTimer.Start(); | ||||
|         InnerTimer.Start(); | ||||
|         ComplexD Comega = innerProduct(t,s); | ||||
|         InnerTimer.Stop(); | ||||
|         omega = Comega.real() / norm2(t); | ||||
|  | ||||
|         // psi = h + omega * s   and   r = s - omega * t | ||||
|         LinearCombTimer.Start(); | ||||
|         { | ||||
|           autoView( psi_v,psi, AcceleratorWrite); | ||||
|           autoView( r_v , r, AcceleratorWrite); | ||||
|           autoView( h_v , h, AcceleratorRead); | ||||
|           autoView( s_v , s, AcceleratorRead); | ||||
|           autoView( t_v , t, AcceleratorRead); | ||||
|           accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ | ||||
|               coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); | ||||
|               coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); | ||||
|             }); | ||||
|         } | ||||
|         LinearCombTimer.Stop(); | ||||
|  | ||||
|         cp = norm2(r); | ||||
|         LinalgTimer.Stop(); | ||||
|  | ||||
|         std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; | ||||
|  | ||||
|         // Stopping condition | ||||
|         if(cp <= rsq)  | ||||
|         { | ||||
|           SolverTimer.Stop(); | ||||
|           // Recompute the residual from scratch to report the true (not iterated) value | ||||
|           Linop.Op(psi, v); | ||||
|           p = v - src; | ||||
|  | ||||
|           RealD srcnorm = sqrt(norm2(src)); | ||||
|           RealD resnorm = sqrt(norm2(p)); | ||||
|           RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
|           std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl; | ||||
|           std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl; | ||||
|           std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl; | ||||
|           std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
|           std::cout << GridLogMessage << "Time breakdown " << std::endl; | ||||
|           std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() << std::endl; | ||||
|           std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() << std::endl; | ||||
|           std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() << std::endl; | ||||
|           std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() << std::endl; | ||||
|           std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl; | ||||
|           std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl; | ||||
|  | ||||
|           // Guard against the iterated residual having drifted far from the true one | ||||
|           if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); } | ||||
|  | ||||
|           IterationsToComplete = k; | ||||
|  | ||||
|           return; | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl; | ||||
|  | ||||
|       if(ErrorOnNoConverge){ assert(0); } | ||||
|       IterationsToComplete = k; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
							
								
								
									
										158
									
								
								Grid/algorithms/iterative/BiCGSTABMixedPrec.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										158
									
								
								Grid/algorithms/iterative/BiCGSTABMixedPrec.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,158 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| Source file: ./Grid/algorithms/iterative/BiCGSTABMixedPrec.h | ||||
|  | ||||
| Copyright (C) 2015 | ||||
|  | ||||
| Author: Christopher Kelly <ckelly@phys.columbia.edu> | ||||
| Author: David Murphy <djmurphy@mit.edu> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #ifndef GRID_BICGSTAB_MIXED_PREC_H | ||||
| #define GRID_BICGSTAB_MIXED_PREC_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| // Mixed precision restarted defect correction BiCGSTAB | ||||
| // Restarted defect-correction solver: each outer iteration computes the residual with the | ||||
| // double-precision operator, solves for a correction with a single-precision BiCGSTAB and | ||||
| // adds that correction to the double-precision solution. Once the outer loop exits, a final | ||||
| // double-precision BiCGSTAB is run on the original source starting from the accumulated solution. | ||||
| // | ||||
| // FieldD must be a double-precision field type and FieldF its single-precision counterpart | ||||
| // (enforced through getPrecision<> in the template parameter list). | ||||
| template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> | ||||
| class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> | ||||
| { | ||||
|   public: | ||||
|     RealD   Tolerance;          // Target tolerance; also handed to the final double-precision solve | ||||
|     RealD   InnerTolerance;     // Initial tolerance for the inner BiCGSTAB. Defaults to Tolerance but can be changed | ||||
|     Integer MaxInnerIterations; // Iteration cap for each inner solve and for the final double-precision solve | ||||
|     Integer MaxOuterIterations; // Maximum number of restarts | ||||
|     GridBase* SinglePrecGrid;   // Grid for single-precision fields | ||||
|     RealD OuterLoopNormMult;    // The outer loop stops, and the final double-precision solve starts, once the | ||||
|                                 // residual norm reported by axpy_norm is below OuterLoopNormMult * (norm2(src) * Tolerance^2) | ||||
|     LinearOperatorBase<FieldF> &Linop_f; // Single-precision operator used by the inner solves | ||||
|     LinearOperatorBase<FieldD> &Linop_d; // Double-precision operator used for residuals and the final solve | ||||
|  | ||||
|     Integer TotalInnerIterations;     // Number of inner BiCGSTAB iterations, summed over all restarts | ||||
|     Integer TotalOuterIterations;     // Number of restarts | ||||
|     Integer TotalFinalStepIterations; // Number of BiCGSTAB iterations in the final patch-up step | ||||
|  | ||||
|     // Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||
|     LinearFunction<FieldF> *guesser; | ||||
|  | ||||
|     // Members are initialised in declaration order, and the iteration counters start at zero so | ||||
|     // that they hold defined values even before the first solve. | ||||
|     MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, | ||||
|         LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) : | ||||
|       Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), | ||||
|       MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), | ||||
|       Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||
|       TotalInnerIterations(0), TotalOuterIterations(0), TotalFinalStepIterations(0), | ||||
|       guesser(nullptr) {} | ||||
|  | ||||
|     // Register a guesser. Only the address is stored, so g must outlive every later solve. | ||||
|     void useGuesser(LinearFunction<FieldF>& g){ | ||||
|       guesser = &g; | ||||
|     } | ||||
|  | ||||
|     // Solve for sol_d given the source src_d_in. | ||||
|     // sol_d is read before it is written (the first residual is src_d_in - Linop_d * sol_d), | ||||
|     // so the caller must supply an initialised starting guess in sol_d. | ||||
|     // On return the three Total*Iterations counters describe the work done by this call. | ||||
|     void operator() (const FieldD& src_d_in, FieldD& sol_d) | ||||
|     { | ||||
|       TotalInnerIterations = 0; | ||||
|  | ||||
|       GridStopWatch TotalTimer; | ||||
|       TotalTimer.Start(); | ||||
|  | ||||
|       int cb = src_d_in.Checkerboard(); | ||||
|       sol_d.Checkerboard() = cb; | ||||
|  | ||||
|       RealD src_norm = norm2(src_d_in); | ||||
|       RealD stop = src_norm * Tolerance*Tolerance; | ||||
|  | ||||
|       GridBase* DoublePrecGrid = src_d_in.Grid(); | ||||
|       FieldD tmp_d(DoublePrecGrid); | ||||
|       tmp_d.Checkerboard() = cb; | ||||
|  | ||||
|       FieldD src_d(DoublePrecGrid); | ||||
|       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||
|  | ||||
|       RealD inner_tol = InnerTolerance; | ||||
|  | ||||
|       FieldF src_f(SinglePrecGrid); | ||||
|       src_f.Checkerboard() = cb; | ||||
|  | ||||
|       FieldF sol_f(SinglePrecGrid); | ||||
|       sol_f.Checkerboard() = cb; | ||||
|  | ||||
|       // Inner single-precision solver. It is allowed to stop without converging because the | ||||
|       // outer loop re-evaluates the true residual in double precision after every restart. | ||||
|       BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|       CG_f.ErrorOnNoConverge = false; | ||||
|  | ||||
|       GridStopWatch InnerCGtimer; | ||||
|  | ||||
|       GridStopWatch PrecChangeTimer; | ||||
|  | ||||
|       Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|  | ||||
|       for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++) | ||||
|       { | ||||
|         // Compute double precision rsd and also new RHS vector. | ||||
|         Linop_d.Op(sol_d, tmp_d); | ||||
|         RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||
|  | ||||
|         std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl; | ||||
|  | ||||
|         if(norm < OuterLoopNormMult * stop){ | ||||
|           std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl; | ||||
|           break; | ||||
|         } | ||||
|  | ||||
|         // Relax the inner tolerance so the inner solve is not asked for more accuracy than the | ||||
|         // overall target requires. A tolerance of exactly zero can never grow by doubling, so it | ||||
|         // is excluded here to avoid spinning forever. | ||||
|         while(inner_tol != 0. && norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ?? | ||||
|  | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(src_f, src_d); | ||||
|         PrecChangeTimer.Stop(); | ||||
|  | ||||
|         sol_f = Zero(); | ||||
|  | ||||
|         //Optionally improve inner solver guess (eg using known eigenvectors) | ||||
|         if(guesser != nullptr){ (*guesser)(src_f, sol_f); } | ||||
|  | ||||
|         //Inner BiCGSTAB | ||||
|         CG_f.Tolerance = inner_tol; | ||||
|         InnerCGtimer.Start(); | ||||
|         CG_f(Linop_f, src_f, sol_f); | ||||
|         InnerCGtimer.Stop(); | ||||
|         TotalInnerIterations += CG_f.IterationsToComplete; | ||||
|  | ||||
|         //Convert sol back to double and add to double prec solution | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(tmp_d, sol_f); | ||||
|         PrecChangeTimer.Stop(); | ||||
|  | ||||
|         axpy(sol_d, 1.0, tmp_d, sol_d); | ||||
|       } | ||||
|  | ||||
|       //Final trial BiCGSTAB in double precision, starting from the accumulated solution | ||||
|       std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl; | ||||
|  | ||||
|       BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||
|       CG_d(Linop_d, src_d_in, sol_d); | ||||
|       TotalFinalStepIterations = CG_d.IterationsToComplete; | ||||
|  | ||||
|       TotalTimer.Stop(); | ||||
|       std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; | ||||
|       std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
| @@ -27,11 +27,9 @@ See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||
| #define GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||
| #pragma once | ||||
|  | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; | ||||
|  | ||||
| @@ -54,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> { | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|   Integer PrintInterval; //GridLogMessages or Iterative | ||||
|   RealD TrueResidual; | ||||
|    | ||||
|   BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||
|     : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100) | ||||
| @@ -154,12 +153,12 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel | ||||
| void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)  | ||||
| { | ||||
|   int Orthog = blockDim; // First dimension is block dim; this is an assumption | ||||
|   Nblock = B._grid->_fdimensions[Orthog]; | ||||
|   Nblock = B.Grid()->_fdimensions[Orthog]; | ||||
| /* FAKE */ | ||||
|   Nblock=8; | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   X.checkerboard = B.checkerboard; | ||||
|   X.Checkerboard() = B.Checkerboard(); | ||||
|   conformable(X, B); | ||||
|  | ||||
|   Field tmp(B); | ||||
| @@ -308,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | ||||
|  | ||||
|       Linop.HermOp(X, AD); | ||||
|       AD = AD-B; | ||||
|       std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl; | ||||
|       TrueResidual = std::sqrt(norm2(AD)/norm2(B)); | ||||
|       std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
| @@ -334,11 +334,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | ||||
| void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)  | ||||
| { | ||||
|   int Orthog = blockDim; // First dimension is block dim | ||||
|   Nblock = Src._grid->_fdimensions[Orthog]; | ||||
|   Nblock = Src.Grid()->_fdimensions[Orthog]; | ||||
|  | ||||
|   std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   Psi.checkerboard = Src.checkerboard; | ||||
|   Psi.Checkerboard() = Src.Checkerboard(); | ||||
|   conformable(Psi, Src); | ||||
|  | ||||
|   Field P(Src); | ||||
| @@ -444,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field & | ||||
|  | ||||
|       Linop.HermOp(Psi, AP); | ||||
|       AP = AP-Src; | ||||
|       std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|       TrueResidual = std::sqrt(norm2(AP)/norm2(Src)); | ||||
|       std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
| @@ -478,7 +479,7 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector< | ||||
|   for(int b=0;b<Nblock;b++){ | ||||
|     tmp[b]   = Y[b]; | ||||
|     for(int bp=0;bp<Nblock;bp++) { | ||||
|       tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp];  | ||||
|       tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];  | ||||
|     } | ||||
|   } | ||||
|   for(int b=0;b<Nblock;b++){ | ||||
| @@ -488,9 +489,9 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector< | ||||
| void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){ | ||||
|   // Should make this cache friendly with site outermost, parallel_for | ||||
|   for(int b=0;b<Nblock;b++){ | ||||
|     AP[b] = zero; | ||||
|     AP[b] = Zero(); | ||||
|     for(int bp=0;bp<Nblock;bp++) { | ||||
|       AP[b] += (m(bp,b))*X[bp];  | ||||
|       AP[b] += scomplex(m(bp,b))*X[bp];  | ||||
|     } | ||||
|   } | ||||
| } | ||||
| @@ -517,7 +518,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   for(int b=0;b<Nblock;b++){  | ||||
|     X[b].checkerboard = B[b].checkerboard; | ||||
|     X[b].Checkerboard() = B[b].Checkerboard(); | ||||
|     conformable(X[b], B[b]); | ||||
|     conformable(X[b], X[0]);  | ||||
|   } | ||||
| @@ -655,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field | ||||
|       if ( rr > max_resid ) max_resid = rr; | ||||
|     } | ||||
|  | ||||
|     std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl; | ||||
|     std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl; | ||||
|  | ||||
|     if ( max_resid < Tolerance*Tolerance ) {  | ||||
|  | ||||
| @@ -670,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field | ||||
|  | ||||
|       for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]); | ||||
|       for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b]; | ||||
|       std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl; | ||||
|       TrueResidual = std::sqrt(normv(AD)/normv(B)); | ||||
|       std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
| @@ -690,9 +692,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field | ||||
|   IterationsToComplete = k; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| }; | ||||
|  | ||||
| } | ||||
| #endif | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -34,6 +34,8 @@ namespace Grid { | ||||
| template<class Field> | ||||
| class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge, | ||||
|                           // defaults to true | ||||
|  | ||||
| @@ -52,10 +54,10 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|   Eigen::MatrixXcd H; | ||||
|  | ||||
|   std::vector<std::complex<double>> y; | ||||
|   std::vector<std::complex<double>> gamma; | ||||
|   std::vector<std::complex<double>> c; | ||||
|   std::vector<std::complex<double>> s; | ||||
|   std::vector<ComplexD> y; | ||||
|   std::vector<ComplexD> gamma; | ||||
|   std::vector<ComplexD> c; | ||||
|   std::vector<ComplexD> s; | ||||
|  | ||||
|   CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol, | ||||
|                                                   Integer maxit, | ||||
| @@ -76,7 +78,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|     std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl; | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD guess = norm2(psi); | ||||
| @@ -86,7 +88,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|     RealD ssq = norm2(src); | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|     Field r(src._grid); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; | ||||
| @@ -142,11 +144,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|     RealD cp = 0; | ||||
|  | ||||
|     Field w(src._grid); | ||||
|     Field r(src._grid); | ||||
|     Field w(src.Grid()); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     // this should probably be made a class member so that it is only allocated once, not in every restart | ||||
|     std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; | ||||
|     std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero(); | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     LinOp.Op(psi, w); | ||||
| @@ -157,7 +159,9 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|     gamma[0] = sqrt(norm2(r)); | ||||
|  | ||||
|     v[0] = (1. / gamma[0]) * r; | ||||
|     ComplexD scale = 1.0/gamma[0]; | ||||
|     v[0] = scale * r; | ||||
|  | ||||
|     LinalgTimer.Stop(); | ||||
|  | ||||
|     for (int i=0; i<RestartLength; i++) { | ||||
| @@ -168,7 +172,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|       qrUpdate(i); | ||||
|  | ||||
|       cp = std::norm(gamma[i+1]); | ||||
|       cp = norm(gamma[i+1]); | ||||
|  | ||||
|       std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
| @@ -194,11 +198,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|     LinalgTimer.Start(); | ||||
|     for (int i = 0; i <= iter; ++i) { | ||||
|       H(iter, i) = innerProduct(v[i], w); | ||||
|       w = w - H(iter, i) * v[i]; | ||||
|       w = w - ComplexD(H(iter, i)) * v[i]; | ||||
|     } | ||||
|  | ||||
|     H(iter, iter + 1) = sqrt(norm2(w)); | ||||
|     v[iter + 1] = (1. / H(iter, iter + 1)) * w; | ||||
|     v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w; | ||||
|     LinalgTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -206,13 +210,13 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|  | ||||
|     QrTimer.Start(); | ||||
|     for (int i = 0; i < iter ; ++i) { | ||||
|       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); | ||||
|       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); | ||||
|       auto tmp       = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i)     = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i + 1) = tmp; | ||||
|     } | ||||
|  | ||||
|     // Compute new Givens Rotation | ||||
|     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     auto nu     = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     c[iter]     = H(iter, iter) / nu; | ||||
|     s[iter]     = H(iter, iter + 1) / nu; | ||||
|  | ||||
| @@ -221,7 +225,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|     H(iter, iter + 1) = 0.; | ||||
|  | ||||
|     gamma[iter + 1] = -s[iter] * gamma[iter]; | ||||
|     gamma[iter]     = std::conj(c[iter]) * gamma[iter]; | ||||
|     gamma[iter]     = conjugate(c[iter]) * gamma[iter]; | ||||
|     QrTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -231,8 +235,8 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction< | ||||
|     for (int i = iter; i >= 0; i--) { | ||||
|       y[i] = gamma[i]; | ||||
|       for (int k = i + 1; k <= iter; k++) | ||||
|         y[i] = y[i] - H(k, i) * y[k]; | ||||
|       y[i] = y[i] / H(i, i); | ||||
|         y[i] = y[i] - ComplexD(H(k, i)) * y[k]; | ||||
|       y[i] = y[i] / ComplexD(H(i, i)); | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i <= iter; i++) | ||||
|   | ||||
| @@ -27,11 +27,11 @@ with this program; if not, write to the Free Software Foundation, Inc., | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| 			   /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_H | ||||
| #define GRID_CONJUGATE_GRADIENT_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Base classes for iterative processes based on operators | ||||
| @@ -40,25 +40,30 @@ namespace Grid { | ||||
|  | ||||
| template <class Field> | ||||
| class ConjugateGradient : public OperatorFunction<Field> { | ||||
|  public: | ||||
| public: | ||||
|  | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||
|                            // Defaults true. | ||||
|   RealD Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|   RealD TrueResidual; | ||||
|    | ||||
|   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||
|       : Tolerance(tol), | ||||
|         MaxIterations(maxit), | ||||
|         ErrorOnNoConverge(err_on_no_conv){}; | ||||
|     : Tolerance(tol), | ||||
|       MaxIterations(maxit), | ||||
|       ErrorOnNoConverge(err_on_no_conv){}; | ||||
|  | ||||
|   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { | ||||
|  | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD cp, c, a, d, b, ssq, qq, b_pred; | ||||
|     RealD cp, c, a, d, b, ssq, qq; | ||||
|     //RealD b_pred; | ||||
|  | ||||
|     Field p(src); | ||||
|     Field mmp(src); | ||||
| @@ -67,10 +72,9 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|     // Initial residual computation & set up | ||||
|     RealD guess = norm2(psi); | ||||
|     assert(std::isnan(guess) == 0); | ||||
|  | ||||
|      | ||||
|     Linop.HermOpAndNorm(psi, mmp, d, b); | ||||
|  | ||||
|      | ||||
|     r = src - mmp; | ||||
|     p = r; | ||||
|  | ||||
| @@ -78,6 +82,14 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|     cp = a; | ||||
|     ssq = norm2(src); | ||||
|  | ||||
|     // Handle trivial case of zero src | ||||
|     if (ssq == 0.){ | ||||
|       psi = Zero(); | ||||
|       IterationsToComplete = 1; | ||||
|       TrueResidual = 0.; | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl; | ||||
| @@ -89,6 +101,7 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|  | ||||
|     // Check if guess is really REALLY good :) | ||||
|     if (cp <= rsq) { | ||||
|       TrueResidual = std::sqrt(a/ssq); | ||||
|       std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl; | ||||
|       IterationsToComplete = 0;	 | ||||
|       return; | ||||
| @@ -127,15 +140,20 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|       b = cp / c; | ||||
|  | ||||
|       LinearCombTimer.Start(); | ||||
|       parallel_for(int ss=0;ss<src._grid->oSites();ss++){ | ||||
| 	vstream(psi[ss], a      *  p[ss] + psi[ss]); | ||||
| 	vstream(p  [ss], b      *  p[ss] + r[ss]); | ||||
|       { | ||||
| 	autoView( psi_v , psi, AcceleratorWrite); | ||||
| 	autoView( p_v   , p,   AcceleratorWrite); | ||||
| 	autoView( r_v   , r,   AcceleratorWrite); | ||||
| 	accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ | ||||
| 	    coalescedWrite(psi_v[ss], a      *  p_v(ss) + psi_v(ss)); | ||||
| 	    coalescedWrite(p_v[ss]  , b      *  p_v(ss) + r_v  (ss)); | ||||
| 	}); | ||||
|       } | ||||
|       LinearCombTimer.Stop(); | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k | ||||
|                 << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; | ||||
|                 << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; | ||||
|  | ||||
|       // Stopping condition | ||||
|       if (cp <= rsq) { | ||||
| @@ -143,30 +161,37 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|         Linop.HermOpAndNorm(psi, mmp, d, qq); | ||||
|         p = mmp - src; | ||||
|  | ||||
|         RealD srcnorm = sqrt(norm2(src)); | ||||
|         RealD resnorm = sqrt(norm2(p)); | ||||
|         RealD srcnorm = std::sqrt(norm2(src)); | ||||
|         RealD resnorm = std::sqrt(norm2(p)); | ||||
|         RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
|         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; | ||||
|         std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k  | ||||
| 		  << "\tComputed residual " << std::sqrt(cp / ssq) | ||||
| 		  << "\tTrue residual " << true_residual | ||||
| 		  << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
|         std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||
|         std::cout << GridLogIterative << "Time breakdown "<<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||
|  | ||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
| 	IterationsToComplete = k;	 | ||||
| 	TrueResidual = true_residual; | ||||
|  | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|     // Failed. Calculate true residual before giving up                                                          | ||||
|     Linop.HermOpAndNorm(psi, mmp, d, qq); | ||||
|     p = mmp - src; | ||||
|  | ||||
|     TrueResidual = sqrt(norm2(p)/ssq); | ||||
|  | ||||
|     std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl; | ||||
|  | ||||
|     if (ErrorOnNoConverge) assert(0); | ||||
| @@ -174,5 +199,5 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|  | ||||
|   } | ||||
| }; | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,13 +23,12 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H | ||||
| #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   //Mixed precision restarted defect correction CG | ||||
|   template<class FieldD,class FieldF,  | ||||
| @@ -67,98 +66,96 @@ namespace Grid { | ||||
|       guesser = &g; | ||||
|     } | ||||
|    | ||||
|     void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
|       TotalInnerIterations = 0; | ||||
|   void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
|     TotalInnerIterations = 0; | ||||
| 	 | ||||
|       GridStopWatch TotalTimer; | ||||
|       TotalTimer.Start(); | ||||
|     GridStopWatch TotalTimer; | ||||
|     TotalTimer.Start(); | ||||
|      | ||||
|       int cb = src_d_in.checkerboard; | ||||
|       sol_d.checkerboard = cb; | ||||
|     int cb = src_d_in.Checkerboard(); | ||||
|     sol_d.Checkerboard() = cb; | ||||
|      | ||||
|       RealD src_norm = norm2(src_d_in); | ||||
|       RealD stop = src_norm * Tolerance*Tolerance; | ||||
|     RealD src_norm = norm2(src_d_in); | ||||
|     RealD stop = src_norm * Tolerance*Tolerance; | ||||
|  | ||||
|       GridBase* DoublePrecGrid = src_d_in._grid; | ||||
|       FieldD tmp_d(DoublePrecGrid); | ||||
|       tmp_d.checkerboard = cb; | ||||
|     GridBase* DoublePrecGrid = src_d_in.Grid(); | ||||
|     FieldD tmp_d(DoublePrecGrid); | ||||
|     tmp_d.Checkerboard() = cb; | ||||
|      | ||||
|       FieldD tmp2_d(DoublePrecGrid); | ||||
|       tmp2_d.checkerboard = cb; | ||||
|     FieldD tmp2_d(DoublePrecGrid); | ||||
|     tmp2_d.Checkerboard() = cb; | ||||
|      | ||||
|       FieldD src_d(DoublePrecGrid); | ||||
|       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||
|     FieldD src_d(DoublePrecGrid); | ||||
|     src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||
|      | ||||
|       RealD inner_tol = InnerTolerance; | ||||
|     RealD inner_tol = InnerTolerance; | ||||
|      | ||||
|       FieldF src_f(SinglePrecGrid); | ||||
|       src_f.checkerboard = cb; | ||||
|     FieldF src_f(SinglePrecGrid); | ||||
|     src_f.Checkerboard() = cb; | ||||
|      | ||||
|       FieldF sol_f(SinglePrecGrid); | ||||
|       sol_f.checkerboard = cb; | ||||
|     FieldF sol_f(SinglePrecGrid); | ||||
|     sol_f.Checkerboard() = cb; | ||||
|      | ||||
|       ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|       CG_f.ErrorOnNoConverge = false; | ||||
|     ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|     CG_f.ErrorOnNoConverge = false; | ||||
|  | ||||
|       GridStopWatch InnerCGtimer; | ||||
|     GridStopWatch InnerCGtimer; | ||||
|  | ||||
|       GridStopWatch PrecChangeTimer; | ||||
|     GridStopWatch PrecChangeTimer; | ||||
|      | ||||
|       Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|     Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|        | ||||
|       for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
| 	//Compute double precision rsd and also new RHS vector. | ||||
| 	Linop_d.HermOp(sol_d, tmp_d); | ||||
| 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||
|     for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
|       //Compute double precision rsd and also new RHS vector. | ||||
|       Linop_d.HermOp(sol_d, tmp_d); | ||||
|       RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||
|        | ||||
| 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||
|  | ||||
| 	if(norm < OuterLoopNormMult * stop){ | ||||
| 	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | ||||
| 	  break; | ||||
| 	} | ||||
| 	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||
|  | ||||
| 	PrecChangeTimer.Start(); | ||||
| 	precisionChange(src_f, src_d); | ||||
| 	PrecChangeTimer.Stop(); | ||||
|        | ||||
| 	zeroit(sol_f); | ||||
|  | ||||
| 	//Optionally improve inner solver guess (eg using known eigenvectors) | ||||
| 	if(guesser != NULL) | ||||
| 	  (*guesser)(src_f, sol_f); | ||||
|  | ||||
| 	//Inner CG | ||||
| 	CG_f.Tolerance = inner_tol; | ||||
| 	InnerCGtimer.Start(); | ||||
| 	CG_f(Linop_f, src_f, sol_f); | ||||
| 	InnerCGtimer.Stop(); | ||||
| 	TotalInnerIterations += CG_f.IterationsToComplete; | ||||
|        | ||||
| 	//Convert sol back to double and add to double prec solution | ||||
| 	PrecChangeTimer.Start(); | ||||
| 	precisionChange(tmp_d, sol_f); | ||||
| 	PrecChangeTimer.Stop(); | ||||
|        | ||||
| 	axpy(sol_d, 1.0, tmp_d, sol_d); | ||||
|       if(norm < OuterLoopNormMult * stop){ | ||||
| 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | ||||
| 	break; | ||||
|       } | ||||
|      | ||||
|       //Final trial CG | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|       ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||
|       CG_d(Linop_d, src_d_in, sol_d); | ||||
|       TotalFinalStepIterations = CG_d.IterationsToComplete; | ||||
|       while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||
|  | ||||
|       TotalTimer.Stop(); | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|       PrecChangeTimer.Start(); | ||||
|       precisionChange(src_f, src_d); | ||||
|       PrecChangeTimer.Stop(); | ||||
|        | ||||
|       sol_f = Zero(); | ||||
|  | ||||
|       //Optionally improve inner solver guess (eg using known eigenvectors) | ||||
|       if(guesser != NULL) | ||||
| 	(*guesser)(src_f, sol_f); | ||||
|  | ||||
|       //Inner CG | ||||
|       CG_f.Tolerance = inner_tol; | ||||
|       InnerCGtimer.Start(); | ||||
|       CG_f(Linop_f, src_f, sol_f); | ||||
|       InnerCGtimer.Stop(); | ||||
|       TotalInnerIterations += CG_f.IterationsToComplete; | ||||
|        | ||||
|       //Convert sol back to double and add to double prec solution | ||||
|       PrecChangeTimer.Start(); | ||||
|       precisionChange(tmp_d, sol_f); | ||||
|       PrecChangeTimer.Stop(); | ||||
|        | ||||
|       axpy(sol_d, 1.0, tmp_d, sol_d); | ||||
|     } | ||||
|   }; | ||||
|      | ||||
|     //Final trial CG | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|     ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||
|     CG_d(Linop_d, src_d_in, sol_d); | ||||
|     TotalFinalStepIterations = CG_d.IterationsToComplete; | ||||
|  | ||||
|     TotalTimer.Stop(); | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,147 +24,165 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H | ||||
| #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     ///////////////////////////////////////////////////////////// | ||||
|     // Base classes for iterative processes based on operators | ||||
|     // single input vec, single output vec. | ||||
|     ///////////////////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Base classes for iterative processes based on operators | ||||
| // single input vec, single output vec. | ||||
| ///////////////////////////////////////////////////////////// | ||||
|  | ||||
|   template<class Field>  | ||||
|     class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>, | ||||
|                                         public OperatorFunction<Field> | ||||
|     { | ||||
| template<class Field>  | ||||
| class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>, | ||||
| 				    public OperatorFunction<Field> | ||||
| { | ||||
| public:                                                 | ||||
|     RealD   Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|     int verbose; | ||||
|     MultiShiftFunction shifts; | ||||
|  | ||||
|     ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :  | ||||
| 	MaxIterations(maxit), | ||||
| 	shifts(_shifts) | ||||
|     {  | ||||
|       verbose=1; | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|   std::vector<int> IterationsToCompleteShift;  // Iterations for this shift | ||||
|   int verbose; | ||||
|   MultiShiftFunction shifts; | ||||
|   std::vector<RealD> TrueResidualShift; | ||||
|  | ||||
|   ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :  | ||||
|     MaxIterations(maxit), | ||||
|     shifts(_shifts) | ||||
|   {  | ||||
|     verbose=1; | ||||
|     IterationsToCompleteShift.resize(_shifts.order); | ||||
|     TrueResidualShift.resize(_shifts.order); | ||||
|   } | ||||
|  | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) | ||||
|   { | ||||
|     GridBase *grid = src.Grid(); | ||||
|     int nshift = shifts.order; | ||||
|     std::vector<Field> results(nshift,grid); | ||||
|     (*this)(Linop,src,results,psi); | ||||
|   } | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi) | ||||
|   { | ||||
|     int nshift = shifts.order; | ||||
|  | ||||
|     (*this)(Linop,src,results); | ||||
|    | ||||
|     psi = shifts.norm*src; | ||||
|     for(int i=0;i<nshift;i++){ | ||||
|       psi = psi + shifts.residues[i]*results[i]; | ||||
|     } | ||||
|  | ||||
| void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) | ||||
| { | ||||
|   GridBase *grid = src._grid; | ||||
|   int nshift = shifts.order; | ||||
|   std::vector<Field> results(nshift,grid); | ||||
|   (*this)(Linop,src,results,psi); | ||||
| } | ||||
| void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi) | ||||
| { | ||||
|   int nshift = shifts.order; | ||||
|  | ||||
|   (*this)(Linop,src,results); | ||||
|    | ||||
|   psi = shifts.norm*src; | ||||
|   for(int i=0;i<nshift;i++){ | ||||
|     psi = psi + shifts.residues[i]*results[i]; | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   return; | ||||
| } | ||||
|   void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) | ||||
|   { | ||||
|    | ||||
|     GridBase *grid = src.Grid(); | ||||
|    | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     // Convenience references to the info stored in "MultiShiftFunction" | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     int nshift = shifts.order; | ||||
|  | ||||
| void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) | ||||
| { | ||||
|    | ||||
|   GridBase *grid = src._grid; | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   // Convenience references to the info stored in "MultiShiftFunction" | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   int nshift = shifts.order; | ||||
|     std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts" | ||||
|     std::vector<RealD> &mresidual(shifts.tolerances); | ||||
|     std::vector<RealD> alpha(nshift,1.0); | ||||
|     std::vector<Field>   ps(nshift,grid);// Search directions | ||||
|  | ||||
|   std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts" | ||||
|   std::vector<RealD> &mresidual(shifts.tolerances); | ||||
|   std::vector<RealD> alpha(nshift,1.0); | ||||
|   std::vector<Field>   ps(nshift,grid);// Search directions | ||||
|     assert(psi.size()==nshift); | ||||
|     assert(mass.size()==nshift); | ||||
|     assert(mresidual.size()==nshift); | ||||
|    | ||||
|     // dynamic sized arrays on stack; 2d is a pain with vector | ||||
|     RealD  bs[nshift]; | ||||
|     RealD  rsq[nshift]; | ||||
|     RealD  z[nshift][2]; | ||||
|     int     converged[nshift]; | ||||
|    | ||||
|     const int       primary =0; | ||||
|    | ||||
|     //Primary shift fields CG iteration | ||||
|     RealD a,b,c,d; | ||||
|     RealD cp,bp,qq; //prev | ||||
|    | ||||
|     // Matrix mult fields | ||||
|     Field r(grid); | ||||
|     Field p(grid); | ||||
|     Field tmp(grid); | ||||
|     Field mmp(grid); | ||||
|    | ||||
|     // Check lightest mass | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       assert( mass[s]>= mass[primary] ); | ||||
|       converged[s]=0; | ||||
|     } | ||||
|    | ||||
|     // Wire guess to zero | ||||
|     // Residuals "r" are src | ||||
|     // First search direction "p" is also src | ||||
|     cp = norm2(src); | ||||
|  | ||||
|   assert(psi.size()==nshift); | ||||
|   assert(mass.size()==nshift); | ||||
|   assert(mresidual.size()==nshift); | ||||
|     // Handle trivial case of zero src. | ||||
|     if( cp == 0. ){ | ||||
|       for(int s=0;s<nshift;s++){ | ||||
| 	psi[s] = Zero(); | ||||
| 	IterationsToCompleteShift[s] = 1; | ||||
| 	TrueResidualShift[s] = 0.; | ||||
|       } | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       rsq[s] = cp * mresidual[s] * mresidual[s]; | ||||
|       std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s | ||||
| 	       <<" target resid "<<rsq[s]<<std::endl; | ||||
|       ps[s] = src; | ||||
|     } | ||||
|     // r and p for primary | ||||
|     r=src; | ||||
|     p=src; | ||||
|    | ||||
|   // dynamic sized arrays on stack; 2d is a pain with vector | ||||
|   RealD  bs[nshift]; | ||||
|   RealD  rsq[nshift]; | ||||
|   RealD  z[nshift][2]; | ||||
|   int     converged[nshift]; | ||||
|     //MdagM+m[0] | ||||
|     Linop.HermOpAndNorm(p,mmp,d,qq); | ||||
|     axpy(mmp,mass[0],p,mmp); | ||||
|     RealD rn = norm2(p); | ||||
|     d += rn*mass[0]; | ||||
|    | ||||
|   const int       primary =0; | ||||
|     // have verified that inner product of  | ||||
|     // p and mmp is equal to d after this since | ||||
|     // the d computation is tricky | ||||
|     //  qq = real(innerProduct(p,mmp)); | ||||
|     //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl; | ||||
|    | ||||
|   //Primary shift fields CG iteration | ||||
|   RealD a,b,c,d; | ||||
|   RealD cp,bp,qq; //prev | ||||
|     b = -cp /d; | ||||
|    | ||||
|   // Matrix mult fields | ||||
|   Field r(grid); | ||||
|   Field p(grid); | ||||
|   Field tmp(grid); | ||||
|   Field mmp(grid); | ||||
|     // Set up the various shift variables | ||||
|     int       iz=0; | ||||
|     z[0][1-iz] = 1.0; | ||||
|     z[0][iz]   = 1.0; | ||||
|     bs[0]      = b; | ||||
|     for(int s=1;s<nshift;s++){ | ||||
|       z[s][1-iz] = 1.0; | ||||
|       z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0])); | ||||
|       bs[s]      = b*z[s][iz];  | ||||
|     } | ||||
|    | ||||
|   // Check lightest mass | ||||
|   for(int s=0;s<nshift;s++){ | ||||
|     assert( mass[s]>= mass[primary] ); | ||||
|     converged[s]=0; | ||||
|   } | ||||
|     // r += b[0] A.p[0] | ||||
|     // c= norm(r) | ||||
|     c=axpy_norm(r,b,mmp,r); | ||||
|    | ||||
|   // Wire guess to zero | ||||
|   // Residuals "r" are src | ||||
|   // First search direction "p" is also src | ||||
|   cp = norm2(src); | ||||
|   for(int s=0;s<nshift;s++){ | ||||
|     rsq[s] = cp * mresidual[s] * mresidual[s]; | ||||
|     std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s | ||||
| 	     <<" target resid "<<rsq[s]<<std::endl; | ||||
|     ps[s] = src; | ||||
|   } | ||||
|   // r and p for primary | ||||
|   r=src; | ||||
|   p=src; | ||||
|     for(int s=0;s<nshift;s++) { | ||||
|       axpby(psi[s],0.,-bs[s]*alpha[s],src,src); | ||||
|     } | ||||
|    | ||||
|   //MdagM+m[0] | ||||
|   Linop.HermOpAndNorm(p,mmp,d,qq); | ||||
|   axpy(mmp,mass[0],p,mmp); | ||||
|   RealD rn = norm2(p); | ||||
|   d += rn*mass[0]; | ||||
|    | ||||
|   // have verified that inner product of  | ||||
|   // p and mmp is equal to d after this since | ||||
|   // the d computation is tricky | ||||
|   //  qq = real(innerProduct(p,mmp)); | ||||
|   //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl; | ||||
|    | ||||
|   b = -cp /d; | ||||
|    | ||||
|   // Set up the various shift variables | ||||
|   int       iz=0; | ||||
|   z[0][1-iz] = 1.0; | ||||
|   z[0][iz]   = 1.0; | ||||
|   bs[0]      = b; | ||||
|   for(int s=1;s<nshift;s++){ | ||||
|     z[s][1-iz] = 1.0; | ||||
|     z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0])); | ||||
|     bs[s]      = b*z[s][iz];  | ||||
|   } | ||||
|    | ||||
|   // r += b[0] A.p[0] | ||||
|   // c= norm(r) | ||||
|   c=axpy_norm(r,b,mmp,r); | ||||
|    | ||||
|   for(int s=0;s<nshift;s++) { | ||||
|     axpby(psi[s],0.,-bs[s]*alpha[s],src,src); | ||||
|   } | ||||
|   | ||||
|   /////////////////////////////////////// | ||||
|   // Timers | ||||
|   /////////////////////////////////////// | ||||
| @@ -175,37 +193,37 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
|   GridStopWatch SolverTimer; | ||||
|   SolverTimer.Start(); | ||||
|    | ||||
|   // Iteration loop | ||||
|   int k; | ||||
|     // Iteration loop | ||||
|     int k; | ||||
|    | ||||
|   for (k=1;k<=MaxIterations;k++){ | ||||
|     for (k=1;k<=MaxIterations;k++){ | ||||
|      | ||||
|     a = c /cp; | ||||
|       a = c /cp; | ||||
|     AXPYTimer.Start(); | ||||
|     axpy(p,a,p,r); | ||||
|       axpy(p,a,p,r); | ||||
|     AXPYTimer.Stop(); | ||||
|      | ||||
|     // Note to self - direction ps is iterated seperately | ||||
|     // for each shift. Does not appear to have any scope | ||||
|     // for avoiding linear algebra in "single" case. | ||||
|     //  | ||||
|     // However SAME r is used. Could load "r" and update | ||||
|     // ALL ps[s]. 2/3 Bandwidth saving | ||||
|     // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
|       // Note to self - direction ps is iterated seperately | ||||
|       // for each shift. Does not appear to have any scope | ||||
|       // for avoiding linear algebra in "single" case. | ||||
|       //  | ||||
|       // However SAME r is used. Could load "r" and update | ||||
|       // ALL ps[s]. 2/3 Bandwidth saving | ||||
|       // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
|     AXPYTimer.Start(); | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       if ( ! converged[s] ) {  | ||||
| 	if (s==0){ | ||||
| 	  axpy(ps[s],a,ps[s],r); | ||||
| 	} else{ | ||||
| 	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b); | ||||
| 	  axpby(ps[s],z[s][iz],as,r,ps[s]); | ||||
|       for(int s=0;s<nshift;s++){ | ||||
| 	if ( ! converged[s] ) {  | ||||
| 	  if (s==0){ | ||||
| 	    axpy(ps[s],a,ps[s],r); | ||||
| 	  } else{ | ||||
| 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b); | ||||
| 	    axpby(ps[s],z[s][iz],as,r,ps[s]); | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|     AXPYTimer.Stop(); | ||||
|      | ||||
|     cp=c; | ||||
|       cp=c; | ||||
|     MatrixTimer.Start();   | ||||
|     //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used | ||||
|     // The below is faster on KNL | ||||
| @@ -215,89 +233,91 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
|     MatrixTimer.Stop();   | ||||
|  | ||||
|     AXPYTimer.Start(); | ||||
|     axpy(mmp,mass[0],p,mmp); | ||||
|       axpy(mmp,mass[0],p,mmp); | ||||
|     AXPYTimer.Stop(); | ||||
|     RealD rn = norm2(p); | ||||
|     d += rn*mass[0]; | ||||
|       RealD rn = norm2(p); | ||||
|       d += rn*mass[0]; | ||||
|      | ||||
|     bp=b; | ||||
|     b=-cp/d; | ||||
|       bp=b; | ||||
|       b=-cp/d; | ||||
|      | ||||
|     AXPYTimer.Start(); | ||||
|     c=axpy_norm(r,b,mmp,r); | ||||
|       c=axpy_norm(r,b,mmp,r); | ||||
|     AXPYTimer.Stop(); | ||||
|  | ||||
|     // Toggle the recurrence history | ||||
|     bs[0] = b; | ||||
|     iz = 1-iz; | ||||
|       // Toggle the recurrence history | ||||
|       bs[0] = b; | ||||
|       iz = 1-iz; | ||||
|     ShiftTimer.Start(); | ||||
|     for(int s=1;s<nshift;s++){ | ||||
|       if((!converged[s])){ | ||||
| 	RealD z0 = z[s][1-iz]; | ||||
| 	RealD z1 = z[s][iz]; | ||||
| 	z[s][iz] = z0*z1*bp | ||||
| 	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));  | ||||
| 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | ||||
|       for(int s=1;s<nshift;s++){ | ||||
| 	if((!converged[s])){ | ||||
| 	  RealD z0 = z[s][1-iz]; | ||||
| 	  RealD z1 = z[s][iz]; | ||||
| 	  z[s][iz] = z0*z1*bp | ||||
| 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));  | ||||
| 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|     ShiftTimer.Stop(); | ||||
|      | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|       int ss = s; | ||||
|       // Scope for optimisation here in case of "single". | ||||
|       // Could load psi[0] and pull all ps[s] in. | ||||
|       //      if ( single ) ss=primary; | ||||
|       // Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving | ||||
|       // Pipelined CG gain: | ||||
|       // | ||||
|       // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
|       // New Kernel: Load psi[0], vector of coeffs, vector of pointers ps | ||||
|       // If can predict the coefficient bs then we can fuse these and avoid write reread cyce | ||||
|       //  on ps[s]. | ||||
|       // Before:  3 x npole  + 3 x npole | ||||
|       // After :  2 x npole (ps[s])        => 3x speed up of multishift CG. | ||||
|       for(int s=0;s<nshift;s++){ | ||||
| 	int ss = s; | ||||
| 	// Scope for optimisation here in case of "single". | ||||
| 	// Could load psi[0] and pull all ps[s] in. | ||||
| 	//      if ( single ) ss=primary; | ||||
| 	// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving | ||||
| 	// Pipelined CG gain: | ||||
| 	// | ||||
| 	// New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||
| 	// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps | ||||
| 	// If can predict the coefficient bs then we can fuse these and avoid write reread cyce | ||||
| 	//  on ps[s]. | ||||
| 	// Before:  3 x npole  + 3 x npole | ||||
| 	// After :  2 x npole (ps[s])        => 3x speed up of multishift CG. | ||||
|        | ||||
|       if( (!converged[s]) ) {  | ||||
| 	axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     // Convergence checks | ||||
|     int all_converged = 1; | ||||
|     for(int s=0;s<nshift;s++){ | ||||
|        | ||||
|       if ( (!converged[s]) ){ | ||||
| 	 | ||||
| 	RealD css  = c * z[s][iz]* z[s][iz]; | ||||
| 	 | ||||
| 	if(css<rsq[s]){ | ||||
| 	  if ( ! converged[s] ) | ||||
| 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl; | ||||
| 	      converged[s]=1; | ||||
| 	} else { | ||||
| 	  all_converged=0; | ||||
| 	if( (!converged[s]) ) {  | ||||
| 	  axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); | ||||
| 	} | ||||
|  | ||||
|       } | ||||
|     } | ||||
|      | ||||
|     if ( all_converged ){ | ||||
|       // Convergence checks | ||||
|       int all_converged = 1; | ||||
|       for(int s=0;s<nshift;s++){ | ||||
|        | ||||
| 	if ( (!converged[s]) ){ | ||||
| 	  IterationsToCompleteShift[s] = k; | ||||
| 	 | ||||
| 	  RealD css  = c * z[s][iz]* z[s][iz]; | ||||
| 	 | ||||
| 	  if(css<rsq[s]){ | ||||
| 	    if ( ! converged[s] ) | ||||
| 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl; | ||||
| 	    converged[s]=1; | ||||
| 	  } else { | ||||
| 	    all_converged=0; | ||||
| 	  } | ||||
|  | ||||
| 	} | ||||
|       } | ||||
|      | ||||
|       if ( all_converged ){ | ||||
|  | ||||
|     SolverTimer.Stop(); | ||||
|  | ||||
|  | ||||
|       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl; | ||||
|       std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; | ||||
| 	std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl; | ||||
| 	std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; | ||||
|        | ||||
|       // Check answers  | ||||
|       for(int s=0; s < nshift; s++) {  | ||||
| 	Linop.HermOpAndNorm(psi[s],mmp,d,qq); | ||||
| 	axpy(tmp,mass[s],psi[s],mmp); | ||||
| 	axpy(r,-alpha[s],src,tmp); | ||||
| 	RealD rn = norm2(r); | ||||
| 	RealD cn = norm2(src); | ||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||
|       } | ||||
| 	// Check answers  | ||||
| 	for(int s=0; s < nshift; s++) {  | ||||
| 	  Linop.HermOpAndNorm(psi[s],mmp,d,qq); | ||||
| 	  axpy(tmp,mass[s],psi[s],mmp); | ||||
| 	  axpy(r,-alpha[s],src,tmp); | ||||
| 	  RealD rn = norm2(r); | ||||
| 	  RealD cn = norm2(src); | ||||
| 	  TrueResidualShift[s] = std::sqrt(rn/cn); | ||||
| 	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl; | ||||
| 	} | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
| @@ -307,16 +327,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | ||||
|  | ||||
|       IterationsToComplete = k;	 | ||||
|  | ||||
|       return; | ||||
|     } | ||||
| 	return; | ||||
|       } | ||||
|  | ||||
|     | ||||
|     } | ||||
|     // ugly hack | ||||
|     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; | ||||
|     //  assert(0); | ||||
|   } | ||||
|   // ugly hack | ||||
|   std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; | ||||
| //  assert(0); | ||||
| } | ||||
|  | ||||
|   }; | ||||
| } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,234 +23,236 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H | ||||
| #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||
|   class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { | ||||
|   public: | ||||
|     bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||
|     // Defaults true. | ||||
|     RealD Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|     Integer ReliableUpdatesPerformed; | ||||
| template<class FieldD,class FieldF,  | ||||
| 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, | ||||
| 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||
| class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { | ||||
| public: | ||||
|   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||
|   // Defaults true. | ||||
|   RealD Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||
|   Integer ReliableUpdatesPerformed; | ||||
|  | ||||
|     bool DoFinalCleanup; //Final DP cleanup, defaults to true | ||||
|     Integer IterationsToCleanup; //Final DP cleanup step iterations | ||||
|   bool DoFinalCleanup; //Final DP cleanup, defaults to true | ||||
|   Integer IterationsToCleanup; //Final DP cleanup step iterations | ||||
|      | ||||
|     LinearOperatorBase<FieldF> &Linop_f; | ||||
|     LinearOperatorBase<FieldD> &Linop_d; | ||||
|     GridBase* SinglePrecGrid; | ||||
|     RealD Delta; //reliable update parameter | ||||
|   LinearOperatorBase<FieldF> &Linop_f; | ||||
|   LinearOperatorBase<FieldD> &Linop_d; | ||||
|   GridBase* SinglePrecGrid; | ||||
|   RealD Delta; //reliable update parameter | ||||
|  | ||||
|     //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single | ||||
|     LinearOperatorBase<FieldF> *Linop_fallback; | ||||
|     RealD fallback_transition_tol; | ||||
|   //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single | ||||
|   LinearOperatorBase<FieldF> *Linop_fallback; | ||||
|   RealD fallback_transition_tol; | ||||
|  | ||||
|      | ||||
|     ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true) | ||||
|       : Tolerance(tol), | ||||
|         MaxIterations(maxit), | ||||
| 	Delta(_delta), | ||||
| 	Linop_f(_Linop_f), | ||||
| 	Linop_d(_Linop_d), | ||||
| 	SinglePrecGrid(_sp_grid), | ||||
|         ErrorOnNoConverge(err_on_no_conv), | ||||
| 	DoFinalCleanup(true), | ||||
| 	Linop_fallback(NULL) | ||||
|     {}; | ||||
|   ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true) | ||||
|     : Tolerance(tol), | ||||
|       MaxIterations(maxit), | ||||
|       Delta(_delta), | ||||
|       Linop_f(_Linop_f), | ||||
|       Linop_d(_Linop_d), | ||||
|       SinglePrecGrid(_sp_grid), | ||||
|       ErrorOnNoConverge(err_on_no_conv), | ||||
|       DoFinalCleanup(true), | ||||
|       Linop_fallback(NULL) | ||||
|   {}; | ||||
|  | ||||
|     void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){ | ||||
|       Linop_fallback = &_Linop_fallback; | ||||
|       fallback_transition_tol = _fallback_transition_tol;       | ||||
|     } | ||||
|   void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){ | ||||
|     Linop_fallback = &_Linop_fallback; | ||||
|     fallback_transition_tol = _fallback_transition_tol;       | ||||
|   } | ||||
|      | ||||
|     void operator()(const FieldD &src, FieldD &psi) { | ||||
|       LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; | ||||
|       bool using_fallback = false; | ||||
|   void operator()(const FieldD &src, FieldD &psi) { | ||||
|     LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; | ||||
|     bool using_fallback = false; | ||||
|        | ||||
|       psi.checkerboard = src.checkerboard; | ||||
|       conformable(psi, src); | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|       RealD cp, c, a, d, b, ssq, qq, b_pred; | ||||
|     RealD cp, c, a, d, b, ssq, qq, b_pred; | ||||
|  | ||||
|       FieldD p(src); | ||||
|       FieldD mmp(src); | ||||
|       FieldD r(src); | ||||
|     FieldD p(src); | ||||
|     FieldD mmp(src); | ||||
|     FieldD r(src); | ||||
|  | ||||
|       // Initial residual computation & set up | ||||
|       RealD guess = norm2(psi); | ||||
|       assert(std::isnan(guess) == 0); | ||||
|     // Initial residual computation & set up | ||||
|     RealD guess = norm2(psi); | ||||
|     assert(std::isnan(guess) == 0); | ||||
|      | ||||
|       Linop_d.HermOpAndNorm(psi, mmp, d, b); | ||||
|     Linop_d.HermOpAndNorm(psi, mmp, d, b); | ||||
|      | ||||
|       r = src - mmp; | ||||
|       p = r; | ||||
|     r = src - mmp; | ||||
|     p = r; | ||||
|  | ||||
|       a = norm2(p); | ||||
|       cp = a; | ||||
|       ssq = norm2(src); | ||||
|     a = norm2(p); | ||||
|     cp = a; | ||||
|     ssq = norm2(src); | ||||
|  | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl; | ||||
|       std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl; | ||||
|  | ||||
|       RealD rsq = Tolerance * Tolerance * ssq; | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|       // Check if guess is really REALLY good :) | ||||
|     // Check if guess is really REALLY good :) | ||||
|     if (cp <= rsq) { | ||||
|       std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; | ||||
|       std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl; | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     //Single prec initialization | ||||
|     FieldF r_f(SinglePrecGrid); | ||||
|     r_f.Checkerboard() = r.Checkerboard(); | ||||
|     precisionChange(r_f, r); | ||||
|  | ||||
|     FieldF psi_f(r_f); | ||||
|     psi_f = Zero(); | ||||
|  | ||||
|     FieldF p_f(r_f); | ||||
|     FieldF mmp_f(r_f); | ||||
|  | ||||
|     RealD MaxResidSinceLastRelUp = cp; //initial residual     | ||||
|      | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
| 	      << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; | ||||
|  | ||||
|     GridStopWatch LinalgTimer; | ||||
|     GridStopWatch MatrixTimer; | ||||
|     GridStopWatch SolverTimer; | ||||
|  | ||||
|     SolverTimer.Start(); | ||||
|     int k = 0; | ||||
|     int l = 0; | ||||
|      | ||||
|     for (k = 1; k <= MaxIterations; k++) { | ||||
|       c = cp; | ||||
|  | ||||
|       MatrixTimer.Start(); | ||||
|       Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq); | ||||
|       MatrixTimer.Stop(); | ||||
|  | ||||
|       LinalgTimer.Start(); | ||||
|  | ||||
|       a = c / d; | ||||
|       b_pred = a * (a * qq - d) / c; | ||||
|  | ||||
|       cp = axpy_norm(r_f, -a, mmp_f, r_f); | ||||
|       b = cp / c; | ||||
|  | ||||
|       // Fuse these loops ; should be really easy | ||||
|       psi_f = a * p_f + psi_f; | ||||
|       //p_f = p_f * b + r_f; | ||||
|  | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k | ||||
| 		<< " residual " << cp << " target " << rsq << std::endl; | ||||
|       std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl; | ||||
|       std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl; | ||||
|  | ||||
|       if(cp > MaxResidSinceLastRelUp){ | ||||
| 	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl; | ||||
| 	MaxResidSinceLastRelUp = cp; | ||||
|       } | ||||
| 	   | ||||
|       // Stopping condition | ||||
|       if (cp <= rsq) { | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; | ||||
| 	std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||
| 	//Although not written in the paper, I assume that I have to add on the final solution | ||||
| 	precisionChange(mmp, psi_f); | ||||
| 	psi = psi + mmp; | ||||
| 	 | ||||
| 	 | ||||
| 	SolverTimer.Stop(); | ||||
| 	Linop_d.HermOpAndNorm(psi, mmp, d, qq); | ||||
| 	p = mmp - src; | ||||
|  | ||||
| 	RealD srcnorm = std::sqrt(norm2(src)); | ||||
| 	RealD resnorm = std::sqrt(norm2(p)); | ||||
| 	RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl; | ||||
| 	std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
| 	std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
|  | ||||
| 	IterationsToComplete = k;	 | ||||
| 	ReliableUpdatesPerformed = l; | ||||
| 	   | ||||
| 	if(DoFinalCleanup){ | ||||
| 	  //Do a final CG to cleanup | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n"; | ||||
| 	  ConjugateGradient<FieldD> CG(Tolerance,MaxIterations); | ||||
| 	  CG.ErrorOnNoConverge = ErrorOnNoConverge; | ||||
| 	  CG(Linop_d,src,psi); | ||||
| 	  IterationsToCleanup = CG.IterationsToComplete; | ||||
| 	} | ||||
| 	else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n"; | ||||
| 	return; | ||||
|       } | ||||
|       else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate " | ||||
| 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n"; | ||||
| 	precisionChange(mmp, psi_f); | ||||
| 	psi = psi + mmp; | ||||
|  | ||||
|       //Single prec initialization | ||||
|       FieldF r_f(SinglePrecGrid); | ||||
|       r_f.checkerboard = r.checkerboard; | ||||
|       precisionChange(r_f, r); | ||||
| 	Linop_d.HermOpAndNorm(psi, mmp, d, qq); | ||||
| 	r = src - mmp; | ||||
|  | ||||
|       FieldF psi_f(r_f); | ||||
|       psi_f = zero; | ||||
| 	psi_f = Zero(); | ||||
| 	precisionChange(r_f, r); | ||||
| 	cp = norm2(r); | ||||
| 	MaxResidSinceLastRelUp = cp; | ||||
|  | ||||
|       FieldF p_f(r_f); | ||||
|       FieldF mmp_f(r_f); | ||||
|  | ||||
|       RealD MaxResidSinceLastRelUp = cp; //initial residual     | ||||
|      | ||||
|       std::cout << GridLogIterative << std::setprecision(4) | ||||
| 		<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; | ||||
|  | ||||
|       GridStopWatch LinalgTimer; | ||||
|       GridStopWatch MatrixTimer; | ||||
|       GridStopWatch SolverTimer; | ||||
|  | ||||
|       SolverTimer.Start(); | ||||
|       int k = 0; | ||||
|       int l = 0; | ||||
|      | ||||
|       for (k = 1; k <= MaxIterations; k++) { | ||||
| 	c = cp; | ||||
|  | ||||
| 	MatrixTimer.Start(); | ||||
| 	Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq); | ||||
| 	MatrixTimer.Stop(); | ||||
|  | ||||
| 	LinalgTimer.Start(); | ||||
|  | ||||
| 	a = c / d; | ||||
| 	b_pred = a * (a * qq - d) / c; | ||||
|  | ||||
| 	cp = axpy_norm(r_f, -a, mmp_f, r_f); | ||||
| 	b = cp / c; | ||||
|  | ||||
| 	// Fuse these loops ; should be really easy | ||||
| 	psi_f = a * p_f + psi_f; | ||||
| 	//p_f = p_f * b + r_f; | ||||
|  | ||||
| 	LinalgTimer.Stop(); | ||||
|  | ||||
| 	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k | ||||
| 		  << " residual " << cp << " target " << rsq << std::endl; | ||||
| 	std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl; | ||||
| 	std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl; | ||||
|  | ||||
| 	if(cp > MaxResidSinceLastRelUp){ | ||||
| 	  std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl; | ||||
| 	  MaxResidSinceLastRelUp = cp; | ||||
| 	} | ||||
| 	b = cp/c; | ||||
| 	   | ||||
| 	// Stopping condition | ||||
| 	if (cp <= rsq) { | ||||
| 	  //Although not written in the paper, I assume that I have to add on the final solution | ||||
| 	  precisionChange(mmp, psi_f); | ||||
| 	  psi = psi + mmp; | ||||
| 	 | ||||
| 	 | ||||
| 	  SolverTimer.Stop(); | ||||
| 	  Linop_d.HermOpAndNorm(psi, mmp, d, qq); | ||||
| 	  p = mmp - src; | ||||
|  | ||||
| 	  RealD srcnorm = sqrt(norm2(src)); | ||||
| 	  RealD resnorm = sqrt(norm2(p)); | ||||
| 	  RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
| 	  std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	  std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
|  | ||||
| 	  IterationsToComplete = k;	 | ||||
| 	  ReliableUpdatesPerformed = l; | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl; | ||||
| 	   | ||||
| 	  if(DoFinalCleanup){ | ||||
| 	    //Do a final CG to cleanup | ||||
| 	    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n"; | ||||
| 	    ConjugateGradient<FieldD> CG(Tolerance,MaxIterations); | ||||
| 	    CG.ErrorOnNoConverge = ErrorOnNoConverge; | ||||
| 	    CG(Linop_d,src,psi); | ||||
| 	    IterationsToCleanup = CG.IterationsToComplete; | ||||
| 	  } | ||||
| 	  else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n"; | ||||
| 	  return; | ||||
| 	} | ||||
| 	else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate " | ||||
| 		    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n"; | ||||
| 	  precisionChange(mmp, psi_f); | ||||
| 	  psi = psi + mmp; | ||||
|  | ||||
| 	  Linop_d.HermOpAndNorm(psi, mmp, d, qq); | ||||
| 	  r = src - mmp; | ||||
|  | ||||
| 	  psi_f = zero; | ||||
| 	  precisionChange(r_f, r); | ||||
| 	  cp = norm2(r); | ||||
| 	  MaxResidSinceLastRelUp = cp; | ||||
|  | ||||
| 	  b = cp/c; | ||||
| 	   | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl; | ||||
| 	   | ||||
| 	  l = l+1; | ||||
| 	} | ||||
|  | ||||
| 	p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence | ||||
|  | ||||
| 	if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){ | ||||
| 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl; | ||||
| 	  Linop_f_use = Linop_fallback; | ||||
| 	  using_fallback = true; | ||||
| 	} | ||||
|  | ||||
| 	 | ||||
| 	l = l+1; | ||||
|       } | ||||
|       std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" | ||||
| 		<< std::endl; | ||||
|  | ||||
|       p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence | ||||
|  | ||||
|       if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){ | ||||
| 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl; | ||||
| 	Linop_f_use = Linop_fallback; | ||||
| 	using_fallback = true; | ||||
|       } | ||||
|  | ||||
| 	 | ||||
|     } | ||||
|     std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" | ||||
| 	      << std::endl; | ||||
|        | ||||
|       if (ErrorOnNoConverge) assert(0); | ||||
|       IterationsToComplete = k; | ||||
|       ReliableUpdatesPerformed = l;       | ||||
|     }     | ||||
|   }; | ||||
|  | ||||
|  | ||||
|     if (ErrorOnNoConverge) assert(0); | ||||
|     IterationsToComplete = k; | ||||
|     ReliableUpdatesPerformed = l;       | ||||
|   }     | ||||
| }; | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,88 +24,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_RESIDUAL_H | ||||
| #define GRID_CONJUGATE_RESIDUAL_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     ///////////////////////////////////////////////////////////// | ||||
|     // Base classes for iterative processes based on operators | ||||
|     // single input vec, single output vec. | ||||
|     ///////////////////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Base classes for iterative processes based on operators | ||||
| // single input vec, single output vec. | ||||
| ///////////////////////////////////////////////////////////// | ||||
|  | ||||
|   template<class Field>  | ||||
|     class ConjugateResidual : public OperatorFunction<Field> { | ||||
|   public:                                                 | ||||
|     RealD   Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     int verbose; | ||||
| template<class Field>  | ||||
| class ConjugateResidual : public OperatorFunction<Field> { | ||||
| public:                                                 | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|     ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {  | ||||
|       verbose=0; | ||||
|     }; | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   int verbose; | ||||
|  | ||||
|     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ | ||||
|   ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {  | ||||
|     verbose=0; | ||||
|   }; | ||||
|  | ||||
|       RealD a, b, c, d; | ||||
|       RealD cp, ssq,rsq; | ||||
|   void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ | ||||
|  | ||||
|     RealD a, b; // c, d; | ||||
|     RealD cp, ssq,rsq; | ||||
|        | ||||
|       RealD rAr, rAAr, rArp; | ||||
|       RealD pAp, pAAp; | ||||
|     RealD rAr, rAAr, rArp; | ||||
|     RealD pAp, pAAp; | ||||
|  | ||||
|       GridBase *grid = src._grid; | ||||
|       psi=zero; | ||||
|       Field r(grid),  p(grid), Ap(grid), Ar(grid); | ||||
|     GridBase *grid = src.Grid(); | ||||
|     psi=Zero(); | ||||
|     Field r(grid),  p(grid), Ap(grid), Ar(grid); | ||||
|        | ||||
|       r=src; | ||||
|       p=src; | ||||
|     r=src; | ||||
|     p=src; | ||||
|  | ||||
|     Linop.HermOpAndNorm(p,Ap,pAp,pAAp); | ||||
|     Linop.HermOpAndNorm(r,Ar,rAr,rAAr); | ||||
|  | ||||
|     cp =norm2(r); | ||||
|     ssq=norm2(src); | ||||
|     rsq=Tolerance*Tolerance*ssq; | ||||
|  | ||||
|     if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
|     for(int k=1;k<MaxIterations;k++){ | ||||
|  | ||||
|       a = rAr/pAAp; | ||||
|  | ||||
|       axpy(psi,a,p,psi); | ||||
|  | ||||
|       cp = axpy_norm(r,-a,Ap,r); | ||||
|  | ||||
|       rArp=rAr; | ||||
|  | ||||
|       Linop.HermOpAndNorm(p,Ap,pAp,pAAp); | ||||
|       Linop.HermOpAndNorm(r,Ar,rAr,rAAr); | ||||
|  | ||||
|       cp =norm2(r); | ||||
|       ssq=norm2(src); | ||||
|       rsq=Tolerance*Tolerance*ssq; | ||||
|  | ||||
|       if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
|       for(int k=1;k<MaxIterations;k++){ | ||||
|  | ||||
| 	a = rAr/pAAp; | ||||
|  | ||||
| 	axpy(psi,a,p,psi); | ||||
|  | ||||
| 	cp = axpy_norm(r,-a,Ap,r); | ||||
|  | ||||
| 	rArp=rAr; | ||||
|  | ||||
| 	Linop.HermOpAndNorm(r,Ar,rAr,rAAr); | ||||
|  | ||||
| 	b   =rAr/rArp; | ||||
|       b   =rAr/rArp; | ||||
|   | ||||
| 	axpy(p,b,p,r); | ||||
| 	pAAp=axpy_norm(Ap,b,Ap,Ar); | ||||
|       axpy(p,b,p,r); | ||||
|       pAAp=axpy_norm(Ap,b,Ap,Ar); | ||||
| 	 | ||||
| 	if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
| 	if(cp<rsq) { | ||||
| 	  Linop.HermOp(psi,Ap); | ||||
| 	  axpy(r,-1.0,src,Ap); | ||||
| 	  RealD true_resid = norm2(r)/ssq; | ||||
| 	  std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k | ||||
| 		   << " computed residual "<<sqrt(cp/ssq) | ||||
| 	           << " true residual "<<sqrt(true_resid) | ||||
| 	           << " target "       <<Tolerance <<std::endl; | ||||
| 	  return; | ||||
| 	} | ||||
|       if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
|       if(cp<rsq) { | ||||
| 	Linop.HermOp(psi,Ap); | ||||
| 	axpy(r,-1.0,src,Ap); | ||||
| 	RealD true_resid = norm2(r)/ssq; | ||||
| 	std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k | ||||
| 		 << " computed residual "<<std::sqrt(cp/ssq) | ||||
| 		 << " true residual "<<std::sqrt(true_resid) | ||||
| 		 << " target "       <<Tolerance <<std::endl; | ||||
| 	return; | ||||
|       } | ||||
|  | ||||
|       std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl; | ||||
|       assert(0); | ||||
|     } | ||||
|   }; | ||||
| } | ||||
|  | ||||
|     std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl; | ||||
|     assert(0); | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -33,7 +33,7 @@ namespace Grid { | ||||
| template<class Field> | ||||
| class ZeroGuesser: public LinearFunction<Field> { | ||||
| public: | ||||
|   virtual void operator()(const Field &src, Field &guess) { guess = zero; }; | ||||
|     virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; | ||||
| }; | ||||
| template<class Field> | ||||
| class DoNothingGuesser: public LinearFunction<Field> { | ||||
| @@ -60,14 +60,14 @@ public: | ||||
|   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; | ||||
|  | ||||
|   virtual void operator()(const Field &src,Field &guess) { | ||||
|     guess = zero; | ||||
|     guess = Zero(); | ||||
|     assert(evec.size()==eval.size()); | ||||
|     auto N = evec.size(); | ||||
|     for (int i=0;i<N;i++) { | ||||
|       const Field& tmp = evec[i]; | ||||
|       axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); | ||||
|     } | ||||
|     guess.checkerboard = src.checkerboard; | ||||
|     guess.Checkerboard() = src.Checkerboard(); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @@ -90,15 +90,15 @@ public: | ||||
|    | ||||
|   void operator()(const FineField &src,FineField &guess) {  | ||||
|     int N = (int)evec_coarse.size(); | ||||
|     CoarseField src_coarse(evec_coarse[0]._grid); | ||||
|     CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero; | ||||
|     CoarseField src_coarse(evec_coarse[0].Grid()); | ||||
|     CoarseField guess_coarse(evec_coarse[0].Grid());    guess_coarse = Zero(); | ||||
|     blockProject(src_coarse,src,subspace);     | ||||
|     for (int i=0;i<N;i++) { | ||||
|       const CoarseField & tmp = evec_coarse[i]; | ||||
|       axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse); | ||||
|     } | ||||
|     blockPromote(guess_coarse,guess,subspace); | ||||
|     guess.checkerboard = src.checkerboard; | ||||
|     guess.Checkerboard() = src.Checkerboard(); | ||||
|   }; | ||||
| }; | ||||
|  | ||||
|   | ||||
| @@ -34,6 +34,8 @@ namespace Grid { | ||||
| template<class Field> | ||||
| class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge, | ||||
|                           // defaults to true | ||||
|  | ||||
| @@ -53,10 +55,10 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|  | ||||
|   Eigen::MatrixXcd H; | ||||
|  | ||||
|   std::vector<std::complex<double>> y; | ||||
|   std::vector<std::complex<double>> gamma; | ||||
|   std::vector<std::complex<double>> c; | ||||
|   std::vector<std::complex<double>> s; | ||||
|   std::vector<ComplexD> y; | ||||
|   std::vector<ComplexD> gamma; | ||||
|   std::vector<ComplexD> c; | ||||
|   std::vector<ComplexD> s; | ||||
|  | ||||
|   LinearFunction<Field> &Preconditioner; | ||||
|  | ||||
| @@ -81,7 +83,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|  | ||||
|     std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl; | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD guess = norm2(psi); | ||||
| @@ -91,7 +93,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|     RealD ssq = norm2(src); | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|     Field r(src._grid); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; | ||||
| @@ -149,12 +151,12 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|  | ||||
|     RealD cp = 0; | ||||
|  | ||||
|     Field w(src._grid); | ||||
|     Field r(src._grid); | ||||
|     Field w(src.Grid()); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     // these should probably be made class members so that they are only allocated once, not in every restart | ||||
|     std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; | ||||
|     std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; | ||||
|     std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero(); | ||||
|     std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero(); | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     LinOp.Op(psi, w); | ||||
| @@ -176,7 +178,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|  | ||||
|       qrUpdate(i); | ||||
|  | ||||
|       cp = std::norm(gamma[i+1]); | ||||
|       cp = norm(gamma[i+1]); | ||||
|  | ||||
|       std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
| @@ -206,11 +208,11 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|     LinalgTimer.Start(); | ||||
|     for (int i = 0; i <= iter; ++i) { | ||||
|       H(iter, i) = innerProduct(v[i], w); | ||||
|       w = w - H(iter, i) * v[i]; | ||||
|       w = w - ComplexD(H(iter, i)) * v[i]; | ||||
|     } | ||||
|  | ||||
|     H(iter, iter + 1) = sqrt(norm2(w)); | ||||
|     v[iter + 1] = (1. / H(iter, iter + 1)) * w; | ||||
|     v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w; | ||||
|     LinalgTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -218,13 +220,13 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|  | ||||
|     QrTimer.Start(); | ||||
|     for (int i = 0; i < iter ; ++i) { | ||||
|       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); | ||||
|       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); | ||||
|       auto tmp       = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i)     = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i + 1) = tmp; | ||||
|     } | ||||
|  | ||||
|     // Compute new Givens Rotation | ||||
|     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     c[iter]     = H(iter, iter) / nu; | ||||
|     s[iter]     = H(iter, iter + 1) / nu; | ||||
|  | ||||
| @@ -233,7 +235,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|     H(iter, iter + 1) = 0.; | ||||
|  | ||||
|     gamma[iter + 1] = -s[iter] * gamma[iter]; | ||||
|     gamma[iter]     = std::conj(c[iter]) * gamma[iter]; | ||||
|     gamma[iter]     = conjugate(c[iter]) * gamma[iter]; | ||||
|     QrTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -243,8 +245,8 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF | ||||
|     for (int i = iter; i >= 0; i--) { | ||||
|       y[i] = gamma[i]; | ||||
|       for (int k = i + 1; k <= iter; k++) | ||||
|         y[i] = y[i] - H(k, i) * y[k]; | ||||
|       y[i] = y[i] / H(i, i); | ||||
|         y[i] = y[i] - ComplexD(H(k, i)) * y[k]; | ||||
|       y[i] = y[i] / ComplexD(H(i, i)); | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i <= iter; i++) | ||||
|   | ||||
| @@ -34,6 +34,8 @@ namespace Grid { | ||||
| template<class Field> | ||||
| class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge, | ||||
|                           // defaults to true | ||||
|  | ||||
| @@ -53,10 +55,10 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|   Eigen::MatrixXcd H; | ||||
|  | ||||
|   std::vector<std::complex<double>> y; | ||||
|   std::vector<std::complex<double>> gamma; | ||||
|   std::vector<std::complex<double>> c; | ||||
|   std::vector<std::complex<double>> s; | ||||
|   std::vector<ComplexD> y; | ||||
|   std::vector<ComplexD> gamma; | ||||
|   std::vector<ComplexD> c; | ||||
|   std::vector<ComplexD> s; | ||||
|  | ||||
|   LinearFunction<Field> &Preconditioner; | ||||
|  | ||||
| @@ -79,7 +81,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|   void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD guess = norm2(psi); | ||||
| @@ -89,7 +91,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     RealD ssq = norm2(src); | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|     Field r(src._grid); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl; | ||||
| @@ -147,12 +149,12 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|     RealD cp = 0; | ||||
|  | ||||
|     Field w(src._grid); | ||||
|     Field r(src._grid); | ||||
|     Field w(src.Grid()); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     // these should probably be made class members so that they are only allocated once, not in every restart | ||||
|     std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; | ||||
|     std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; | ||||
|     std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero(); | ||||
|     std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero(); | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     LinOp.Op(psi, w); | ||||
| @@ -174,7 +176,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|       qrUpdate(i); | ||||
|  | ||||
|       cp = std::norm(gamma[i+1]); | ||||
|       cp = norm(gamma[i+1]); | ||||
|  | ||||
|       std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
| @@ -204,11 +206,11 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     LinalgTimer.Start(); | ||||
|     for (int i = 0; i <= iter; ++i) { | ||||
|       H(iter, i) = innerProduct(v[i], w); | ||||
|       w = w - H(iter, i) * v[i]; | ||||
|       w = w - ComplexD(H(iter, i)) * v[i]; | ||||
|     } | ||||
|  | ||||
|     H(iter, iter + 1) = sqrt(norm2(w)); | ||||
|     v[iter + 1] = (1. / H(iter, iter + 1)) * w; | ||||
|     v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w; | ||||
|     LinalgTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -216,13 +218,13 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|     QrTimer.Start(); | ||||
|     for (int i = 0; i < iter ; ++i) { | ||||
|       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); | ||||
|       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); | ||||
|       auto tmp       = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i)     = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i + 1) = tmp; | ||||
|     } | ||||
|  | ||||
|     // Compute new Givens Rotation | ||||
|     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     c[iter]     = H(iter, iter) / nu; | ||||
|     s[iter]     = H(iter, iter + 1) / nu; | ||||
|  | ||||
| @@ -231,7 +233,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     H(iter, iter + 1) = 0.; | ||||
|  | ||||
|     gamma[iter + 1] = -s[iter] * gamma[iter]; | ||||
|     gamma[iter]     = std::conj(c[iter]) * gamma[iter]; | ||||
|     gamma[iter]     = conjugate(c[iter]) * gamma[iter]; | ||||
|     QrTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -241,8 +243,8 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     for (int i = iter; i >= 0; i--) { | ||||
|       y[i] = gamma[i]; | ||||
|       for (int k = i + 1; k <= iter; k++) | ||||
|         y[i] = y[i] - H(k, i) * y[k]; | ||||
|       y[i] = y[i] / H(i, i); | ||||
|         y[i] = y[i] - ComplexD(H(k, i)) * y[k]; | ||||
|       y[i] = y[i] / ComplexD(H(i, i)); | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i <= iter; i++) | ||||
|   | ||||
| @@ -34,6 +34,8 @@ namespace Grid { | ||||
| template<class Field> | ||||
| class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge, | ||||
|                           // defaults to true | ||||
|  | ||||
| @@ -52,10 +54,10 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|   Eigen::MatrixXcd H; | ||||
|  | ||||
|   std::vector<std::complex<double>> y; | ||||
|   std::vector<std::complex<double>> gamma; | ||||
|   std::vector<std::complex<double>> c; | ||||
|   std::vector<std::complex<double>> s; | ||||
|   std::vector<ComplexD> y; | ||||
|   std::vector<ComplexD> gamma; | ||||
|   std::vector<ComplexD> c; | ||||
|   std::vector<ComplexD> s; | ||||
|  | ||||
|   GeneralisedMinimalResidual(RealD   tol, | ||||
|                              Integer maxit, | ||||
| @@ -74,7 +76,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|   void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD guess = norm2(psi); | ||||
| @@ -84,7 +86,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     RealD ssq = norm2(src); | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|     Field r(src._grid); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl; | ||||
| @@ -140,11 +142,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|     RealD cp = 0; | ||||
|  | ||||
|     Field w(src._grid); | ||||
|     Field r(src._grid); | ||||
|     Field w(src.Grid()); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     // this should probably be made a class member so that it is only allocated once, not in every restart | ||||
|     std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; | ||||
|     std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero(); | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     LinOp.Op(psi, w); | ||||
| @@ -166,7 +168,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|       qrUpdate(i); | ||||
|  | ||||
|       cp = std::norm(gamma[i+1]); | ||||
|       cp = norm(gamma[i+1]); | ||||
|  | ||||
|       std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
| @@ -192,11 +194,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     LinalgTimer.Start(); | ||||
|     for (int i = 0; i <= iter; ++i) { | ||||
|       H(iter, i) = innerProduct(v[i], w); | ||||
|       w = w - H(iter, i) * v[i]; | ||||
|       w = w - ComplexD(H(iter, i)) * v[i]; | ||||
|     } | ||||
|  | ||||
|     H(iter, iter + 1) = sqrt(norm2(w)); | ||||
|     v[iter + 1] = (1. / H(iter, iter + 1)) * w; | ||||
|     v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w; | ||||
|     LinalgTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -204,13 +206,13 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|     QrTimer.Start(); | ||||
|     for (int i = 0; i < iter ; ++i) { | ||||
|       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); | ||||
|       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); | ||||
|       auto tmp       = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i)     = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i + 1) = tmp; | ||||
|     } | ||||
|  | ||||
|     // Compute new Givens Rotation | ||||
|     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     c[iter]     = H(iter, iter) / nu; | ||||
|     s[iter]     = H(iter, iter + 1) / nu; | ||||
|  | ||||
| @@ -219,7 +221,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     H(iter, iter + 1) = 0.; | ||||
|  | ||||
|     gamma[iter + 1] = -s[iter] * gamma[iter]; | ||||
|     gamma[iter]     = std::conj(c[iter]) * gamma[iter]; | ||||
|     gamma[iter]     = conjugate(c[iter]) * gamma[iter]; | ||||
|     QrTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -229,8 +231,8 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> { | ||||
|     for (int i = iter; i >= 0; i--) { | ||||
|       y[i] = gamma[i]; | ||||
|       for (int k = i + 1; k <= iter; k++) | ||||
|         y[i] = y[i] - H(k, i) * y[k]; | ||||
|       y[i] = y[i] / H(i, i); | ||||
|         y[i] = y[i] - ComplexD(H(k, i)) * y[k]; | ||||
|       y[i] = y[i] / ComplexD(H(i, i)); | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i <= iter; i++) | ||||
|   | ||||
| @@ -35,120 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov> | ||||
| //#include <zlib.h> | ||||
| #include <sys/stat.h> | ||||
|  | ||||
| namespace Grid {  | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // Move following 100 LOC to lattice/Lattice_basis.h | ||||
|   //////////////////////////////////////////////////////// | ||||
| template<class Field> | ||||
| void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)  | ||||
| { | ||||
|   for(int j=0; j<k; ++j){ | ||||
|     auto ip = innerProduct(basis[j],w); | ||||
|     w = w - ip*basis[j]; | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)  | ||||
| { | ||||
|   typedef typename Field::vector_object vobj; | ||||
|   GridBase* grid = basis[0]._grid; | ||||
|        | ||||
|   parallel_region | ||||
|   { | ||||
|  | ||||
|     std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private | ||||
|         | ||||
|     parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ | ||||
|       for(int j=j0; j<j1; ++j) B[j]=0.; | ||||
|        | ||||
|       for(int j=j0; j<j1; ++j){ | ||||
| 	for(int k=k0; k<k1; ++k){ | ||||
| 	  B[j] +=Qt(j,k) * basis[k]._odata[ss]; | ||||
| 	} | ||||
|       } | ||||
|       for(int j=j0; j<j1; ++j){ | ||||
| 	  basis[j]._odata[ss] = B[j]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Extract a single rotated vector | ||||
| template<class Field> | ||||
| void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)  | ||||
| { | ||||
|   typedef typename Field::vector_object vobj; | ||||
|   GridBase* grid = basis[0]._grid; | ||||
|  | ||||
|   result.checkerboard = basis[0].checkerboard; | ||||
|   parallel_for(int ss=0;ss < grid->oSites();ss++){ | ||||
|     vobj B = zero; | ||||
|     for(int k=k0; k<k1; ++k){ | ||||
|       B +=Qt(j,k) * basis[k]._odata[ss]; | ||||
|     } | ||||
|     result._odata[ss] = B; | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)  | ||||
| { | ||||
|   int vlen = idx.size(); | ||||
|  | ||||
|   assert(vlen>=1); | ||||
|   assert(vlen<=sort_vals.size()); | ||||
|   assert(vlen<=_v.size()); | ||||
|  | ||||
|   for (size_t i=0;i<vlen;i++) { | ||||
|  | ||||
|     if (idx[i] != i) { | ||||
|  | ||||
|       ////////////////////////////////////// | ||||
|       // idx[i] is a table of desired sources giving a permutation. | ||||
|       // Swap v[i] with v[idx[i]]. | ||||
|       // Find  j>i for which _vnew[j] = _vold[i], | ||||
|       // track the move idx[j] => idx[i] | ||||
|       // track the move idx[i] => i | ||||
|       ////////////////////////////////////// | ||||
|       size_t j; | ||||
|       for (j=i;j<idx.size();j++) | ||||
| 	if (idx[j]==i) | ||||
| 	  break; | ||||
|  | ||||
|       assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i); | ||||
|  | ||||
|       std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy | ||||
|       std::swap(sort_vals[i],sort_vals[idx[i]]); | ||||
|  | ||||
|       idx[j] = idx[i]; | ||||
|       idx[i] = i; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)  | ||||
| { | ||||
|   std::vector<int> idx(sort_vals.size()); | ||||
|   std::iota(idx.begin(), idx.end(), 0); | ||||
|  | ||||
|   // sort indexes based on comparing values in v | ||||
|   std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { | ||||
|     return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); | ||||
|   }); | ||||
|   return idx; | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)  | ||||
| { | ||||
|   std::vector<int> idx = basisSortGetIndex(sort_vals); | ||||
|   if (reverse) | ||||
|     std::reverse(idx.begin(), idx.end()); | ||||
|    | ||||
|   basisReorderInPlace(_v,sort_vals,idx); | ||||
| } | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
|  | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Implicitly restarted lanczos | ||||
| @@ -259,7 +146,7 @@ public: | ||||
| 			    RealD _eresid, // resid in lmdue deficit  | ||||
| 			    int _MaxIter, // Max iterations | ||||
| 			    RealD _betastp=0.0, // if beta(k) < betastp: converged | ||||
| 			    int _MinRestart=1, int _orth_period = 1, | ||||
| 			    int _MinRestart=0, int _orth_period = 1, | ||||
| 			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : | ||||
|     SimpleTester(HermOp), _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester), | ||||
|     Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm), | ||||
| @@ -275,7 +162,7 @@ public: | ||||
| 			       RealD _eresid, // resid in lmdue deficit  | ||||
| 			       int _MaxIter, // Max iterations | ||||
| 			       RealD _betastp=0.0, // if beta(k) < betastp: converged | ||||
| 			       int _MinRestart=1, int _orth_period = 1, | ||||
| 			       int _MinRestart=0, int _orth_period = 1, | ||||
| 			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : | ||||
|     SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester), | ||||
|     Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm), | ||||
| @@ -289,7 +176,7 @@ public: | ||||
|   template<typename T>  static RealD normalise(T& v)  | ||||
|   { | ||||
|     RealD nn = norm2(v); | ||||
|     nn = sqrt(nn); | ||||
|     nn = std::sqrt(nn); | ||||
|     v = v * (1.0/nn); | ||||
|     return nn; | ||||
|   } | ||||
| @@ -321,10 +208,10 @@ until convergence | ||||
| */ | ||||
|   void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false) | ||||
|   { | ||||
|     GridBase *grid = src._grid; | ||||
|     assert(grid == evec[0]._grid); | ||||
|     GridBase *grid = src.Grid(); | ||||
|     assert(grid == evec[0].Grid()); | ||||
|      | ||||
|     GridLogIRL.TimingMode(1); | ||||
|     //    GridLogIRL.TimingMode(1); | ||||
|     std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; | ||||
|     std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl; | ||||
|     std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; | ||||
| @@ -349,14 +236,17 @@ until convergence | ||||
|     { | ||||
|       auto src_n = src; | ||||
|       auto tmp = src; | ||||
|       std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl; | ||||
|       const int _MAX_ITER_IRL_MEVAPP_ = 50; | ||||
|       for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) { | ||||
| 	normalise(src_n); | ||||
| 	_HermOp(src_n,tmp); | ||||
| 	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0); | ||||
| 	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; | ||||
| 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. | ||||
| 	RealD vden = norm2(src_n); | ||||
| 	RealD na = vnum/vden; | ||||
| 	if (fabs(evalMaxApprox/na - 1.0) < 0.05) | ||||
| 	if (fabs(evalMaxApprox/na - 1.0) < 0.0001) | ||||
| 	  i=_MAX_ITER_IRL_MEVAPP_; | ||||
| 	evalMaxApprox = na; | ||||
| 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; | ||||
| @@ -446,7 +336,7 @@ until convergence | ||||
|       assert(k2<Nm);      assert(k2<Nm);      assert(k1>0); | ||||
|  | ||||
|       basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis | ||||
|       std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl; | ||||
|       std::cout<<GridLogIRL <<"basisRotated  by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl; | ||||
|        | ||||
|       //////////////////////////////////////////////////// | ||||
|       // Compressed vector f and beta(k2) | ||||
| @@ -454,7 +344,7 @@ until convergence | ||||
|       f *= Qt(k2-1,Nm-1); | ||||
|       f += lme[k2-1] * evec[k2]; | ||||
|       beta_k = norm2(f); | ||||
|       beta_k = sqrt(beta_k); | ||||
|       beta_k = std::sqrt(beta_k); | ||||
|       std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl; | ||||
| 	   | ||||
|       RealD betar = 1.0/beta_k; | ||||
| @@ -477,7 +367,7 @@ until convergence | ||||
|  | ||||
| 	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl; | ||||
|  | ||||
| 	Field B(grid); B.checkerboard = evec[0].checkerboard; | ||||
| 	Field B(grid); B.Checkerboard() = evec[0].Checkerboard(); | ||||
|  | ||||
| 	//  power of two search pattern;  not every evalue in eval2 is assessed. | ||||
| 	int allconv =1; | ||||
| @@ -515,7 +405,7 @@ until convergence | ||||
| 	 | ||||
|   converged: | ||||
|     { | ||||
|       Field B(grid); B.checkerboard = evec[0].checkerboard; | ||||
|       Field B(grid); B.Checkerboard() = evec[0].Checkerboard(); | ||||
|       basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	     | ||||
|       std::cout << GridLogIRL << " Rotated basis"<<std::endl; | ||||
|       Nconv=0; | ||||
| @@ -554,11 +444,11 @@ until convergence | ||||
| /* Saad PP. 195 | ||||
| 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 | ||||
| 2. For k = 1,2,...,m Do: | ||||
| 3. wk:=Avk−βkv_{k−1}       | ||||
| 4. αk:=(wk,vk)       //  | ||||
| 5. wk:=wk−αkvk       // wk orthog vk  | ||||
| 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop | ||||
| 7. vk+1 := wk/βk+1 | ||||
| 3. wk:=Avk - b_k v_{k-1}       | ||||
| 4. ak:=(wk,vk)       //  | ||||
| 5. wk:=wk-akvk       // wk orthog vk  | ||||
| 6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop | ||||
| 7. vk+1 := wk/b_k+1 | ||||
| 8. EndDo | ||||
|  */ | ||||
|   void step(std::vector<RealD>& lmd, | ||||
| @@ -566,6 +456,7 @@ until convergence | ||||
| 	    std::vector<Field>& evec, | ||||
| 	    Field& w,int Nm,int k) | ||||
|   { | ||||
|     std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl; | ||||
|     const RealD tiny = 1.0e-20; | ||||
|     assert( k< Nm ); | ||||
|  | ||||
| @@ -577,20 +468,20 @@ until convergence | ||||
|  | ||||
|     if(k>0) w -= lme[k-1] * evec[k-1]; | ||||
|  | ||||
|     ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) | ||||
|     ComplexD zalph = innerProduct(evec_k,w); | ||||
|     RealD     alph = real(zalph); | ||||
|  | ||||
|     w = w - alph * evec_k;// 5. wk:=wk−αkvk | ||||
|     w = w - alph * evec_k; | ||||
|  | ||||
|     RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop | ||||
|     // 7. vk+1 := wk/βk+1 | ||||
|     RealD beta = normalise(w);  | ||||
|  | ||||
|     lmd[k] = alph; | ||||
|     lme[k] = beta; | ||||
|  | ||||
|     if (k>0 && k % orth_period == 0) { | ||||
|     if ( (k>0) && ( (k % orth_period) == 0 )) { | ||||
|       std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl; | ||||
|       orthogonalize(w,evec,k); // orthonormalise | ||||
|       std::cout<<GridLogIRL << "Orthogonalised " <<std::endl; | ||||
|       std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if(k < Nm-1) evec[k+1] = w; | ||||
| @@ -598,6 +489,8 @@ until convergence | ||||
|     std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl; | ||||
|     if ( beta < tiny )  | ||||
|       std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl; | ||||
|   } | ||||
|  | ||||
|   void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,  | ||||
| @@ -807,7 +700,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, | ||||
|      | ||||
|     // determination of 2x2 leading submatrix | ||||
|     RealD dsub = lmd[kmax-1]-lmd[kmax-2]; | ||||
|     RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]); | ||||
|     RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]); | ||||
|     RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub))); | ||||
|     // (Dsh: shift) | ||||
|      | ||||
| @@ -838,5 +731,6 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, | ||||
|   abort(); | ||||
| } | ||||
| }; | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,16 +24,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LOCAL_COHERENCE_IRL_H | ||||
| #define GRID_LOCAL_COHERENCE_IRL_H | ||||
|  | ||||
| namespace Grid {  | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
|  | ||||
| struct LanczosParams : Serializable { | ||||
|  public: | ||||
| public: | ||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, | ||||
| 				  ChebyParams, Cheby,/*Chebyshev*/ | ||||
| 				  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/ | ||||
| @@ -46,7 +45,7 @@ struct LanczosParams : Serializable { | ||||
| }; | ||||
|  | ||||
| struct LocalCoherenceLanczosParams : Serializable { | ||||
|  public: | ||||
| public: | ||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, | ||||
| 				  bool, saveEvecs, | ||||
| 				  bool, doFine, | ||||
| @@ -59,7 +58,7 @@ struct LocalCoherenceLanczosParams : Serializable { | ||||
| 				  RealD        , coarse_relax_tol, | ||||
| 				  std::vector<int>, blockSize, | ||||
| 				  std::string, config, | ||||
| 				  std::vector < std::complex<double>  >, omega, | ||||
| 				  std::vector < ComplexD  >, omega, | ||||
| 				  RealD, mass, | ||||
| 				  RealD, M5); | ||||
| }; | ||||
| @@ -83,14 +82,14 @@ public: | ||||
|   }; | ||||
|  | ||||
|   void operator()(const CoarseField& in, CoarseField& out) { | ||||
|     GridBase *FineGrid = subspace[0]._grid;     | ||||
|     int   checkerboard = subspace[0].checkerboard; | ||||
|        | ||||
|     FineField fin (FineGrid);     fin.checkerboard= checkerboard; | ||||
|     FineField fout(FineGrid);   fout.checkerboard = checkerboard; | ||||
|     GridBase *FineGrid = subspace[0].Grid();     | ||||
|     int   checkerboard = subspace[0].Checkerboard(); | ||||
|  | ||||
|     FineField fin (FineGrid);     fin.Checkerboard()= checkerboard; | ||||
|     FineField fout(FineGrid);   fout.Checkerboard() = checkerboard; | ||||
|  | ||||
|     blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; | ||||
|     _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; | ||||
|     _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; | ||||
|     blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl; | ||||
|   } | ||||
| }; | ||||
| @@ -117,12 +116,12 @@ public: | ||||
|   {  }; | ||||
|  | ||||
|   void operator()(const CoarseField& in, CoarseField& out) { | ||||
|      | ||||
|     GridBase *FineGrid = subspace[0]._grid;     | ||||
|     int   checkerboard = subspace[0].checkerboard; | ||||
|  | ||||
|     FineField fin (FineGrid); fin.checkerboard =checkerboard; | ||||
|     FineField fout(FineGrid);fout.checkerboard =checkerboard; | ||||
|     GridBase *FineGrid = subspace[0].Grid();     | ||||
|     int   checkerboard = subspace[0].Checkerboard(); | ||||
|  | ||||
|     FineField fin (FineGrid); fin.Checkerboard() =checkerboard; | ||||
|     FineField fout(FineGrid);fout.Checkerboard() =checkerboard; | ||||
|      | ||||
|     blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; | ||||
|     _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; | ||||
| @@ -133,7 +132,7 @@ public: | ||||
| template<class Fobj,class CComplex,int nbasis> | ||||
| class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > > | ||||
| { | ||||
|  public: | ||||
| public: | ||||
|   typedef iVector<CComplex,nbasis >           CoarseSiteVector; | ||||
|   typedef Lattice<CoarseSiteVector>           CoarseField; | ||||
|   typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field | ||||
| @@ -142,7 +141,7 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc | ||||
|   LinearFunction<CoarseField> & _Poly; | ||||
|   OperatorFunction<FineField>   & _smoother; | ||||
|   LinearOperatorBase<FineField> &_Linop; | ||||
|   RealD                          _coarse_relax_tol; | ||||
|   RealD                             _coarse_relax_tol; | ||||
|   std::vector<FineField>        &_subspace; | ||||
|    | ||||
|   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly, | ||||
| @@ -182,10 +181,10 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc | ||||
|   } | ||||
|   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) | ||||
|   { | ||||
|     GridBase *FineGrid = _subspace[0]._grid;     | ||||
|     int checkerboard   = _subspace[0].checkerboard; | ||||
|     FineField fB(FineGrid);fB.checkerboard =checkerboard; | ||||
|     FineField fv(FineGrid);fv.checkerboard =checkerboard; | ||||
|     GridBase *FineGrid = _subspace[0].Grid();     | ||||
|     int checkerboard   = _subspace[0].Checkerboard(); | ||||
|     FineField fB(FineGrid);fB.Checkerboard() =checkerboard; | ||||
|     FineField fv(FineGrid);fv.Checkerboard() =checkerboard; | ||||
|  | ||||
|     blockPromote(B,fv,_subspace);   | ||||
|      | ||||
| @@ -305,11 +304,11 @@ public: | ||||
|     int Nk = nbasis; | ||||
|     subspace.resize(Nk,_FineGrid); | ||||
|     subspace[0]=1.0; | ||||
|     subspace[0].checkerboard=_checkerboard; | ||||
|     subspace[0].Checkerboard()=_checkerboard; | ||||
|     normalise(subspace[0]); | ||||
|     PlainHermOp<FineField>    Op(_FineOp); | ||||
|     for(int k=1;k<Nk;k++){ | ||||
|       subspace[k].checkerboard=_checkerboard; | ||||
|       subspace[k].Checkerboard()=_checkerboard; | ||||
|       Op(subspace[k-1],subspace[k]); | ||||
|       normalise(subspace[k]); | ||||
|     } | ||||
| @@ -360,7 +359,11 @@ public: | ||||
|  | ||||
|     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); | ||||
|  | ||||
|     FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; | ||||
|     FineField src(_FineGrid);  | ||||
|     typedef typename FineField::scalar_type Scalar; | ||||
|     // src=1.0;  | ||||
|     src=Scalar(1.0);  | ||||
|     src.Checkerboard() = _checkerboard; | ||||
|  | ||||
|     int Nconv; | ||||
|     IRL.calc(evals_fine,subspace,src,Nconv,false); | ||||
| @@ -402,5 +405,5 @@ public: | ||||
|   } | ||||
| }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -33,6 +33,8 @@ namespace Grid { | ||||
|  | ||||
| template<class Field> class MinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // throw an assert when the MR fails to converge. | ||||
|                           // Defaults true. | ||||
|   RealD   Tolerance; | ||||
| @@ -46,11 +48,11 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> { | ||||
|  | ||||
|   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     Complex a, c; | ||||
|     Real    d; | ||||
|     ComplexD a, c; | ||||
|     RealD    d; | ||||
|  | ||||
|     Field Mr(src); | ||||
|     Field r(src); | ||||
| @@ -71,7 +73,6 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> { | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << "MinimalResidual:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl; | ||||
|  | ||||
|     if (cp <= rsq) { | ||||
|   | ||||
| @@ -34,6 +34,9 @@ namespace Grid { | ||||
| template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> | ||||
| class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> { | ||||
|  public: | ||||
|  | ||||
|   using OperatorFunction<FieldD>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge, | ||||
|                           // defaults to true | ||||
|  | ||||
| @@ -54,10 +57,10 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|  | ||||
|   Eigen::MatrixXcd H; | ||||
|  | ||||
|   std::vector<std::complex<double>> y; | ||||
|   std::vector<std::complex<double>> gamma; | ||||
|   std::vector<std::complex<double>> c; | ||||
|   std::vector<std::complex<double>> s; | ||||
|   std::vector<ComplexD> y; | ||||
|   std::vector<ComplexD> gamma; | ||||
|   std::vector<ComplexD> c; | ||||
|   std::vector<ComplexD> s; | ||||
|  | ||||
|   GridBase* SinglePrecGrid; | ||||
|  | ||||
| @@ -84,7 +87,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|  | ||||
|   void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) { | ||||
|  | ||||
|     psi.checkerboard = src.checkerboard; | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     conformable(psi, src); | ||||
|  | ||||
|     RealD guess = norm2(psi); | ||||
| @@ -94,7 +97,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|     RealD ssq = norm2(src); | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
|     FieldD r(src._grid); | ||||
|     FieldD r(src.Grid()); | ||||
|  | ||||
|     std::cout << std::setprecision(4) << std::scientific; | ||||
|     std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl; | ||||
| @@ -154,12 +157,12 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|  | ||||
|     RealD cp = 0; | ||||
|  | ||||
|     FieldD w(src._grid); | ||||
|     FieldD r(src._grid); | ||||
|     FieldD w(src.Grid()); | ||||
|     FieldD r(src.Grid()); | ||||
|  | ||||
|     // these should probably be made class members so that they are only allocated once, not in every restart | ||||
|     std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; | ||||
|     std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; | ||||
|     std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero(); | ||||
|     std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero(); | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     LinOp.Op(psi, w); | ||||
| @@ -181,7 +184,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|  | ||||
|       qrUpdate(i); | ||||
|  | ||||
|       cp = std::norm(gamma[i+1]); | ||||
|       cp = norm(gamma[i+1]); | ||||
|  | ||||
|       std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount | ||||
|                 << " residual " << cp << " target " << rsq << std::endl; | ||||
| @@ -223,11 +226,11 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|     LinalgTimer.Start(); | ||||
|     for (int i = 0; i <= iter; ++i) { | ||||
|       H(iter, i) = innerProduct(v[i], w); | ||||
|       w = w - H(iter, i) * v[i]; | ||||
|       w = w - ComplexD(H(iter, i)) * v[i]; | ||||
|     } | ||||
|  | ||||
|     H(iter, iter + 1) = sqrt(norm2(w)); | ||||
|     v[iter + 1] = (1. / H(iter, iter + 1)) * w; | ||||
|     v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w; | ||||
|     LinalgTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -235,13 +238,13 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|  | ||||
|     QrTimer.Start(); | ||||
|     for (int i = 0; i < iter ; ++i) { | ||||
|       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); | ||||
|       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); | ||||
|       auto tmp       = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i)     = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1)); | ||||
|       H(iter, i + 1) = tmp; | ||||
|     } | ||||
|  | ||||
|     // Compute new Givens Rotation | ||||
|     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); | ||||
|     c[iter]     = H(iter, iter) / nu; | ||||
|     s[iter]     = H(iter, iter + 1) / nu; | ||||
|  | ||||
| @@ -250,7 +253,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|     H(iter, iter + 1) = 0.; | ||||
|  | ||||
|     gamma[iter + 1] = -s[iter] * gamma[iter]; | ||||
|     gamma[iter]     = std::conj(c[iter]) * gamma[iter]; | ||||
|     gamma[iter]     = conjugate(c[iter]) * gamma[iter]; | ||||
|     QrTimer.Stop(); | ||||
|   } | ||||
|  | ||||
| @@ -260,8 +263,8 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction | ||||
|     for (int i = iter; i >= 0; i--) { | ||||
|       y[i] = gamma[i]; | ||||
|       for (int k = i + 1; k <= iter; k++) | ||||
|         y[i] = y[i] - H(k, i) * y[k]; | ||||
|       y[i] = y[i] / H(i, i); | ||||
|         y[i] = y[i] - ComplexD(H(k, i)) * y[k]; | ||||
|       y[i] = y[i] / ComplexD(H(i, i)); | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i <= iter; i++) | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,38 +23,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_NORMAL_EQUATIONS_H | ||||
| #define GRID_NORMAL_EQUATIONS_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Take a matrix and form an NE solver calling a Herm solver | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> class NormalEquations : public OperatorFunction<Field>{ | ||||
|   private: | ||||
|     SparseMatrixBase<Field> & _Matrix; | ||||
|     OperatorFunction<Field> & _HermitianSolver; | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Take a matrix and form an NE solver calling a Herm solver | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class Field> class NormalEquations { | ||||
| private: | ||||
|   SparseMatrixBase<Field> & _Matrix; | ||||
|   OperatorFunction<Field> & _HermitianSolver; | ||||
|   LinearFunction<Field>   & _Guess; | ||||
| public: | ||||
|  | ||||
|   public: | ||||
|   ///////////////////////////////////////////////////// | ||||
|   // Wrap the usual normal equations trick | ||||
|   ///////////////////////////////////////////////////// | ||||
|  NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver, | ||||
| 		 LinearFunction<Field> &Guess)  | ||||
|    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};  | ||||
|  | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Wrap the usual normal equations trick | ||||
|     ///////////////////////////////////////////////////// | ||||
|   NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)  | ||||
|     :  _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};  | ||||
|  | ||||
|     void operator() (const Field &in, Field &out){ | ||||
|   void operator() (const Field &in, Field &out){ | ||||
|   | ||||
|       Field src(in._grid); | ||||
|     Field src(in.Grid()); | ||||
|     Field tmp(in.Grid()); | ||||
|  | ||||
|       _Matrix.Mdag(in,src); | ||||
|       _HermitianSolver(src,out);  // Mdag M out = Mdag in | ||||
|     MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix); | ||||
|     _Matrix.Mdag(in,src); | ||||
|     _Guess(src,out); | ||||
|     _HermitianSolver(MdagMOp,src,out);  // Mdag M out = Mdag in | ||||
|  | ||||
|   }      | ||||
| }; | ||||
|  | ||||
| template<class Field> class HPDSolver { | ||||
| private: | ||||
|   LinearOperatorBase<Field> & _Matrix; | ||||
|   OperatorFunction<Field> & _HermitianSolver; | ||||
|   LinearFunction<Field>   & _Guess; | ||||
| public: | ||||
|  | ||||
|   ///////////////////////////////////////////////////// | ||||
|   // Run the Hermitian solver directly on the supplied operator, starting from the guess (no normal equations) | ||||
|   ///////////////////////////////////////////////////// | ||||
|  HPDSolver(LinearOperatorBase<Field> &Matrix, | ||||
| 	   OperatorFunction<Field> &HermitianSolver, | ||||
| 	   LinearFunction<Field> &Guess)  | ||||
|    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};  | ||||
|  | ||||
|   void operator() (const Field &in, Field &out){ | ||||
|   | ||||
|     }      | ||||
|   }; | ||||
|     _Guess(in,out); | ||||
|     _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in | ||||
|  | ||||
| } | ||||
|   }      | ||||
| }; | ||||
|  | ||||
|  | ||||
| template<class Field> class MdagMSolver { | ||||
| private: | ||||
|   SparseMatrixBase<Field> & _Matrix; | ||||
|   OperatorFunction<Field> & _HermitianSolver; | ||||
|   LinearFunction<Field>   & _Guess; | ||||
| public: | ||||
|  | ||||
|   ///////////////////////////////////////////////////// | ||||
|   // Solve MdagM out = in: wrap _Matrix in an MdagMLinearOperator, let _Guess fill out, then call the Hermitian solver | ||||
|   ///////////////////////////////////////////////////// | ||||
|  MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver, | ||||
| 	     LinearFunction<Field> &Guess)  | ||||
|    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};  | ||||
|  | ||||
|   void operator() (const Field &in, Field &out){ | ||||
|   | ||||
|     MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix); | ||||
|     _Guess(in,out); | ||||
|  | ||||
|     _HermitianSolver(MdagMOp,in,out);  // Mdag M out = in (the source is passed as given; no Mdag is applied to it here) | ||||
|  | ||||
|   }      | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -14,7 +14,7 @@ template<class Field> class PowerMethod | ||||
|  | ||||
|   RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)  | ||||
|   {  | ||||
|     GridBase *grid = src._grid;  | ||||
|     GridBase *grid = src.Grid();  | ||||
|      | ||||
|     // quickly get an idea of the largest eigenvalue to more properly normalize the residuum  | ||||
|     RealD evalMaxApprox = 0.0;  | ||||
| @@ -30,12 +30,12 @@ template<class Field> class PowerMethod | ||||
|       RealD vden = norm2(src_n);  | ||||
|       RealD na = vnum/vden;  | ||||
|        | ||||
|       if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) {  | ||||
|       if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {  | ||||
|  	evalMaxApprox = na;  | ||||
| 	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; | ||||
|  	return evalMaxApprox;  | ||||
|       }  | ||||
|       evalMaxApprox = na;  | ||||
|       std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; | ||||
|       src_n = tmp; | ||||
|     } | ||||
|     assert(0); | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,97 +23,97 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H | ||||
| #define GRID_PREC_CONJUGATE_RESIDUAL_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     ///////////////////////////////////////////////////////////// | ||||
|     // Base classes for iterative processes based on operators | ||||
|     // single input vec, single output vec. | ||||
|     ///////////////////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////////////// | ||||
| // Base classes for iterative processes based on operators | ||||
| // single input vec, single output vec. | ||||
| ///////////////////////////////////////////////////////////// | ||||
|  | ||||
|   template<class Field>  | ||||
|     class PrecConjugateResidual : public OperatorFunction<Field> { | ||||
|   public:                                                 | ||||
|     RealD   Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     int verbose; | ||||
|     LinearFunction<Field> &Preconditioner; | ||||
| template<class Field>  | ||||
| class PrecConjugateResidual : public OperatorFunction<Field> { | ||||
| public:                                                 | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   int verbose; | ||||
|   LinearFunction<Field> &Preconditioner; | ||||
|  | ||||
|     PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec) | ||||
|     {  | ||||
|       verbose=1; | ||||
|     }; | ||||
|   PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec) | ||||
|   {  | ||||
|     verbose=1; | ||||
|   }; | ||||
|  | ||||
|     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ | ||||
|   void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ | ||||
|  | ||||
|       RealD a, b, c, d; | ||||
|       RealD cp, ssq,rsq; | ||||
|     RealD a, b, c, d; | ||||
|     RealD cp, ssq,rsq; | ||||
|        | ||||
|       RealD rAr, rAAr, rArp; | ||||
|       RealD pAp, pAAp; | ||||
|     RealD rAr, rAAr, rArp; | ||||
|     RealD pAp, pAAp; | ||||
|  | ||||
|       GridBase *grid = src._grid; | ||||
|       Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid); | ||||
|     GridBase *grid = src.Grid(); | ||||
|     Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid); | ||||
|        | ||||
|       psi=zero; | ||||
|       r  = src; | ||||
|       Preconditioner(r,p); | ||||
|     psi=zero; | ||||
|     r  = src; | ||||
|     Preconditioner(r,p); | ||||
|  | ||||
|        | ||||
|  | ||||
|       Linop.HermOpAndNorm(p,Ap,pAp,pAAp); | ||||
|       Ar=Ap; | ||||
|       rAr=pAp; | ||||
|       rAAr=pAAp; | ||||
|     Linop.HermOpAndNorm(p,Ap,pAp,pAAp); | ||||
|     Ar=Ap; | ||||
|     rAr=pAp; | ||||
|     rAAr=pAAp; | ||||
|  | ||||
|       cp =norm2(r); | ||||
|       ssq=norm2(src); | ||||
|       rsq=Tolerance*Tolerance*ssq; | ||||
|     cp =norm2(r); | ||||
|     ssq=norm2(src); | ||||
|     rsq=Tolerance*Tolerance*ssq; | ||||
|  | ||||
|       if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|     if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
|       for(int k=0;k<MaxIterations;k++){ | ||||
|     for(int k=0;k<MaxIterations;k++){ | ||||
|  | ||||
|  | ||||
| 	Preconditioner(Ap,z); | ||||
| 	RealD rq= real(innerProduct(Ap,z));  | ||||
|       Preconditioner(Ap,z); | ||||
|       RealD rq= real(innerProduct(Ap,z));  | ||||
|  | ||||
| 	a = rAr/rq; | ||||
|       a = rAr/rq; | ||||
|  | ||||
|    	axpy(psi,a,p,psi); | ||||
|    cp = axpy_norm(r,-a,z,r); | ||||
|       axpy(psi,a,p,psi); | ||||
|       cp = axpy_norm(r,-a,z,r); | ||||
|  | ||||
| 	rArp=rAr; | ||||
|       rArp=rAr; | ||||
|  | ||||
| 	Linop.HermOpAndNorm(r,Ar,rAr,rAAr); | ||||
|       Linop.HermOpAndNorm(r,Ar,rAr,rAAr); | ||||
|  | ||||
| 	b   =rAr/rArp; | ||||
|       b   =rAr/rArp; | ||||
|   | ||||
| 	axpy(p,b,p,r); | ||||
| 	pAAp=axpy_norm(Ap,b,Ap,Ar); | ||||
|       axpy(p,b,p,r); | ||||
|       pAAp=axpy_norm(Ap,b,Ap,Ar); | ||||
| 	 | ||||
| 	if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
| 	if(cp<rsq) { | ||||
| 	  Linop.HermOp(psi,Ap); | ||||
| 	  axpy(r,-1.0,src,Ap); | ||||
| 	  RealD true_resid = norm2(r)/ssq; | ||||
| 	  std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k | ||||
| 		   << " computed residual "<<sqrt(cp/ssq) | ||||
| 	           << " true residual "<<sqrt(true_resid) | ||||
| 	           << " target "       <<Tolerance <<std::endl; | ||||
| 	  return; | ||||
| 	} | ||||
|       if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; | ||||
|  | ||||
|       if(cp<rsq) { | ||||
| 	Linop.HermOp(psi,Ap); | ||||
| 	axpy(r,-1.0,src,Ap); | ||||
| 	RealD true_resid = norm2(r)/ssq; | ||||
| 	std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k | ||||
| 		 << " computed residual "<<sqrt(cp/ssq) | ||||
| 		 << " true residual "<<sqrt(true_resid) | ||||
| 		 << " target "       <<Tolerance <<std::endl; | ||||
| 	return; | ||||
|       } | ||||
|  | ||||
|       std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl; | ||||
|       assert(0); | ||||
|     } | ||||
|   }; | ||||
| } | ||||
|  | ||||
|     std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl; | ||||
|     assert(0); | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_PREC_GCR_H | ||||
| #define GRID_PREC_GCR_H | ||||
|  | ||||
| @@ -36,206 +36,204 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| //NB. Likely not original reference since they are focussing on a preconditioner variant. | ||||
| //    but VPGCR was nicely written up in their paper | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   template<class Field> | ||||
|     class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> { | ||||
|   public:                                                 | ||||
|     RealD   Tolerance; | ||||
|     Integer MaxIterations; | ||||
|     int verbose; | ||||
|     int mmax; | ||||
|     int nstep; | ||||
|     int steps; | ||||
|     GridStopWatch PrecTimer; | ||||
|     GridStopWatch MatTimer; | ||||
|     GridStopWatch LinalgTimer; | ||||
| #define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "  | ||||
|  | ||||
|     LinearFunction<Field> &Preconditioner; | ||||
| template<class Field> | ||||
| class PrecGeneralisedConjugateResidual : public LinearFunction<Field> { | ||||
| public:                                                 | ||||
|  | ||||
|    PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :  | ||||
|       Tolerance(tol),  | ||||
|       MaxIterations(maxit), | ||||
|       Preconditioner(Prec), | ||||
|       mmax(_mmax), | ||||
|       nstep(_nstep) | ||||
|     {  | ||||
|       verbose=1; | ||||
|     }; | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   int verbose; | ||||
|   int mmax; | ||||
|   int nstep; | ||||
|   int steps; | ||||
|   int level; | ||||
|   GridStopWatch PrecTimer; | ||||
|   GridStopWatch MatTimer; | ||||
|   GridStopWatch LinalgTimer; | ||||
|  | ||||
|     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ | ||||
|   LinearFunction<Field>     &Preconditioner; | ||||
|   LinearOperatorBase<Field> &Linop; | ||||
|  | ||||
|       psi=zero; | ||||
|       RealD cp, ssq,rsq; | ||||
|       ssq=norm2(src); | ||||
|       rsq=Tolerance*Tolerance*ssq; | ||||
|   void Level(int lv) { level=lv; }; | ||||
|  | ||||
|   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :  | ||||
|     Tolerance(tol),  | ||||
|     MaxIterations(maxit), | ||||
|     Linop(_Linop), | ||||
|     Preconditioner(Prec), | ||||
|     mmax(_mmax), | ||||
|     nstep(_nstep) | ||||
|   {  | ||||
|     level=1; | ||||
|     verbose=1; | ||||
|   }; | ||||
|  | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|  | ||||
|     psi=Zero(); | ||||
|     RealD cp, ssq,rsq; | ||||
|     ssq=norm2(src); | ||||
|     rsq=Tolerance*Tolerance*ssq; | ||||
|        | ||||
|       Field r(src._grid); | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|         PrecTimer.Reset(); | ||||
|          MatTimer.Reset(); | ||||
|       LinalgTimer.Reset(); | ||||
|     PrecTimer.Reset(); | ||||
|     MatTimer.Reset(); | ||||
|     LinalgTimer.Reset(); | ||||
|  | ||||
|       GridStopWatch SolverTimer; | ||||
|       SolverTimer.Start(); | ||||
|     GridStopWatch SolverTimer; | ||||
|     SolverTimer.Start(); | ||||
|  | ||||
|       steps=0; | ||||
|       for(int k=0;k<MaxIterations;k++){ | ||||
|     steps=0; | ||||
|     for(int k=0;k<MaxIterations;k++){ | ||||
|  | ||||
| 	cp=GCRnStep(Linop,src,psi,rsq); | ||||
|       cp=GCRnStep(src,psi,rsq); | ||||
|  | ||||
| 	std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl; | ||||
|       GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl; | ||||
|  | ||||
| 	if(cp<rsq) { | ||||
|       if(cp<rsq) { | ||||
|  | ||||
| 	  SolverTimer.Stop(); | ||||
| 	SolverTimer.Stop(); | ||||
|  | ||||
| 	  Linop.HermOp(psi,r); | ||||
| 	  axpy(r,-1.0,src,r); | ||||
| 	  RealD tr = norm2(r); | ||||
| 	  std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps | ||||
| 		   << " computed residual "<<sqrt(cp/ssq) | ||||
| 	           << " true residual "    <<sqrt(tr/ssq) | ||||
| 	           << " target "           <<Tolerance <<std::endl; | ||||
|  | ||||
| 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl; | ||||
| 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl; | ||||
| 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl; | ||||
| 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl; | ||||
| 	  return; | ||||
| 	} | ||||
| 	Linop.HermOp(psi,r); | ||||
| 	axpy(r,-1.0,src,r); | ||||
| 	RealD tr = norm2(r); | ||||
| 	GCRLogLevel<<"PGCR: Converged on iteration " <<steps | ||||
| 		 << " computed residual "<<sqrt(cp/ssq) | ||||
| 		 << " true residual "    <<sqrt(tr/ssq) | ||||
| 		 << " target "           <<Tolerance <<std::endl; | ||||
|  | ||||
| 	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl; | ||||
| 	/* | ||||
| 	  GCRLogLevel<<"PGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl; | ||||
| 	  GCRLogLevel<<"PGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl; | ||||
| 	  GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl; | ||||
| 	*/ | ||||
| 	return; | ||||
|       } | ||||
|       std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl; | ||||
|       assert(0); | ||||
|  | ||||
|     } | ||||
|     GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; | ||||
|     //    assert(0); | ||||
|   } | ||||
|  | ||||
|     RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){ | ||||
|   RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ | ||||
|  | ||||
|       RealD cp; | ||||
|       RealD a, b, c, d; | ||||
|       RealD zAz, zAAz; | ||||
|       RealD rAq, rq; | ||||
|     RealD cp; | ||||
|     RealD a, b; | ||||
|     RealD zAz, zAAz; | ||||
|     RealD rq; | ||||
|  | ||||
|       GridBase *grid = src._grid; | ||||
|     GridBase *grid = src.Grid(); | ||||
|  | ||||
|       Field r(grid); | ||||
|       Field z(grid); | ||||
|       Field tmp(grid); | ||||
|       Field ttmp(grid); | ||||
|       Field Az(grid); | ||||
|     Field r(grid); | ||||
|     Field z(grid); | ||||
|     Field tmp(grid); | ||||
|     Field ttmp(grid); | ||||
|     Field Az(grid); | ||||
|  | ||||
|       //////////////////////////////// | ||||
|       // history for flexible orthog | ||||
|       //////////////////////////////// | ||||
|       std::vector<Field> q(mmax,grid); | ||||
|       std::vector<Field> p(mmax,grid); | ||||
|       std::vector<RealD> qq(mmax); | ||||
|     //////////////////////////////// | ||||
|     // history for flexible orthog | ||||
|     //////////////////////////////// | ||||
|     std::vector<Field> q(mmax,grid); | ||||
|     std::vector<Field> p(mmax,grid); | ||||
|     std::vector<RealD> qq(mmax); | ||||
|        | ||||
|       ////////////////////////////////// | ||||
|       // initial guess x0 is taken as nonzero. | ||||
|       // r0=src-A x0 = src | ||||
|       ////////////////////////////////// | ||||
|       MatTimer.Start(); | ||||
|       Linop.HermOpAndNorm(psi,Az,zAz,zAAz);  | ||||
|       MatTimer.Stop(); | ||||
|     GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl; | ||||
|  | ||||
|     ////////////////////////////////// | ||||
|     // initial guess x0 (psi on entry) may be nonzero. | ||||
|     // r0 = src - A x0 (equal to src only when x0 is zero) | ||||
|     ////////////////////////////////// | ||||
|     MatTimer.Start(); | ||||
|     Linop.HermOpAndNorm(psi,Az,zAz,zAAz);  | ||||
|     MatTimer.Stop(); | ||||
|      | ||||
|  | ||||
|     LinalgTimer.Start(); | ||||
|     r=src-Az; | ||||
|     LinalgTimer.Stop(); | ||||
|     GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl; | ||||
|      | ||||
|     ///////////////////// | ||||
|     // p = Prec(r) | ||||
|     ///////////////////// | ||||
|  | ||||
|     PrecTimer.Start(); | ||||
|     Preconditioner(r,z); | ||||
|     PrecTimer.Stop(); | ||||
|  | ||||
|     MatTimer.Start(); | ||||
|     Linop.HermOpAndNorm(z,Az,zAz,zAAz);  | ||||
|     MatTimer.Stop(); | ||||
|  | ||||
|     LinalgTimer.Start(); | ||||
|  | ||||
|     //p[0],q[0],qq[0]  | ||||
|     p[0]= z; | ||||
|     q[0]= Az; | ||||
|     qq[0]= zAAz; | ||||
|      | ||||
|     cp =norm2(r); | ||||
|     LinalgTimer.Stop(); | ||||
|  | ||||
|     for(int k=0;k<nstep;k++){ | ||||
|  | ||||
|       steps++; | ||||
|  | ||||
|       int kp     = k+1; | ||||
|       int peri_k = k %mmax; | ||||
|       int peri_kp= kp%mmax; | ||||
|  | ||||
|       LinalgTimer.Start(); | ||||
|       r=src-Az; | ||||
|       rq= real(innerProduct(r,q[peri_k])); // what if rAr not real? | ||||
|       a = rq/qq[peri_k]; | ||||
|  | ||||
|       axpy(psi,a,p[peri_k],psi);          | ||||
|  | ||||
|       cp = axpy_norm(r,-a,q[peri_k],r); | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       ///////////////////// | ||||
|       // p = Prec(r) | ||||
|       ///////////////////// | ||||
|       GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl;  | ||||
|  | ||||
|       if((k==nstep-1)||(cp<rsq)){ | ||||
| 	return cp; | ||||
|       } | ||||
|  | ||||
|  | ||||
|       PrecTimer.Start(); | ||||
|       Preconditioner(r,z); | ||||
|       Preconditioner(r,z);// solve Az = r | ||||
|       PrecTimer.Stop(); | ||||
|  | ||||
|       MatTimer.Start(); | ||||
|       Linop.HermOp(z,tmp);  | ||||
|       Linop.HermOpAndNorm(z,Az,zAz,zAAz); | ||||
|       MatTimer.Stop(); | ||||
|  | ||||
|       LinalgTimer.Start(); | ||||
|       ttmp=tmp; | ||||
|       tmp=tmp-r; | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       /* | ||||
|       std::cout<<GridLogMessage<<r<<std::endl; | ||||
|       std::cout<<GridLogMessage<<z<<std::endl; | ||||
|       std::cout<<GridLogMessage<<ttmp<<std::endl; | ||||
|       std::cout<<GridLogMessage<<tmp<<std::endl; | ||||
|       */ | ||||
|       q[peri_kp]=Az; | ||||
|       p[peri_kp]=z; | ||||
|  | ||||
|       MatTimer.Start(); | ||||
|       Linop.HermOpAndNorm(z,Az,zAz,zAAz);  | ||||
|       MatTimer.Stop(); | ||||
|       int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history. | ||||
|       for(int back=0;back<northog;back++){ | ||||
|  | ||||
|       LinalgTimer.Start(); | ||||
|       //p[0],q[0],qq[0]  | ||||
|       p[0]= z; | ||||
|       q[0]= Az; | ||||
|       qq[0]= zAAz; | ||||
| 	int peri_back=(k-back)%mmax;   	  assert((k-back)>=0); | ||||
|  | ||||
|       cp =norm2(r); | ||||
|       LinalgTimer.Stop(); | ||||
| 	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; | ||||
| 	p[peri_kp]=p[peri_kp]+b*p[peri_back]; | ||||
| 	q[peri_kp]=q[peri_kp]+b*q[peri_back]; | ||||
|  | ||||
|       for(int k=0;k<nstep;k++){ | ||||
|  | ||||
| 	steps++; | ||||
|  | ||||
| 	int kp     = k+1; | ||||
| 	int peri_k = k %mmax; | ||||
| 	int peri_kp= kp%mmax; | ||||
|  | ||||
|         LinalgTimer.Start(); | ||||
| 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real? | ||||
| 	a = rq/qq[peri_k]; | ||||
|  | ||||
| 	axpy(psi,a,p[peri_k],psi);          | ||||
|  | ||||
| 	cp = axpy_norm(r,-a,q[peri_k],r); | ||||
|         LinalgTimer.Stop(); | ||||
|  | ||||
| 	if((k==nstep-1)||(cp<rsq)){ | ||||
| 	  return cp; | ||||
| 	} | ||||
|  | ||||
| 	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl;  | ||||
|  | ||||
| 	PrecTimer.Start(); | ||||
| 	Preconditioner(r,z);// solve Az = r | ||||
| 	PrecTimer.Stop(); | ||||
|  | ||||
| 	MatTimer.Start(); | ||||
| 	Linop.HermOpAndNorm(z,Az,zAz,zAAz); | ||||
| 	Linop.HermOp(z,tmp); | ||||
| 	MatTimer.Stop(); | ||||
|  | ||||
|         LinalgTimer.Start(); | ||||
|         tmp=tmp-r; | ||||
| 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;  | ||||
|  | ||||
| 	q[peri_kp]=Az; | ||||
| 	p[peri_kp]=z; | ||||
|  | ||||
| 	int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history. | ||||
| 	for(int back=0;back<northog;back++){ | ||||
|  | ||||
| 	  int peri_back=(k-back)%mmax;   	  assert((k-back)>=0); | ||||
|  | ||||
| 	  b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; | ||||
| 	  p[peri_kp]=p[peri_kp]+b*p[peri_back]; | ||||
| 	  q[peri_kp]=q[peri_kp]+b*q[peri_back]; | ||||
|  | ||||
| 	} | ||||
| 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm | ||||
|         LinalgTimer.Stop(); | ||||
|       } | ||||
|  | ||||
|       assert(0); // never reached | ||||
|       return cp; | ||||
|       qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm | ||||
|       LinalgTimer.Stop(); | ||||
|     } | ||||
|   }; | ||||
| } | ||||
|     assert(0); // never reached | ||||
|     return cp; | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -0,0 +1,241 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_PREC_GCR_NON_HERM_H | ||||
| #define GRID_PREC_GCR_NON_HERM_H | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| //VPGCR Abe and Zhang, 2005. | ||||
| //INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING | ||||
| //Computing and Information Volume 2, Number 2, Pages 147-161 | ||||
| //NB. Likely not original reference since they are focussing on a preconditioner variant. | ||||
| //    but VPGCR was nicely written up in their paper | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| #define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Restarted, variably preconditioned GCR solver for a (possibly non-Hermitian) operator. | ||||
| // | ||||
| //  operator()  : zeroes psi, then calls GCRnStep up to MaxIterations times until the | ||||
| //                squared residual drops below Tolerance^2 * |src|^2. | ||||
| //  GCRnStep    : one restart cycle of at most nstep steps starting from the current psi, | ||||
| //                orthogonalising each new direction against up to mmax-1 earlier ones. | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class Field> | ||||
| class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> { | ||||
| public: | ||||
|  | ||||
|   RealD   Tolerance;      // relative residual target (compared squared against norm2) | ||||
|   Integer MaxIterations;  // maximum number of restart cycles | ||||
|   int verbose; | ||||
|   int mmax;               // length of the circular p/q history | ||||
|   int nstep;              // steps per restart cycle | ||||
|   int steps;              // total steps taken by the last operator() call | ||||
|   int level;              // indentation / tag used by GCRLogLevel | ||||
|   GridStopWatch PrecTimer; | ||||
|   GridStopWatch MatTimer; | ||||
|   GridStopWatch LinalgTimer; | ||||
|  | ||||
|   LinearFunction<Field>     &Preconditioner; | ||||
|   LinearOperatorBase<Field> &Linop; | ||||
|  | ||||
|   void Level(int lv) { level=lv; }; | ||||
|  | ||||
|   // Initialisers are listed in member declaration order (avoids -Wreorder surprises). | ||||
|   PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : | ||||
|     Tolerance(tol), | ||||
|     MaxIterations(maxit), | ||||
|     verbose(1), | ||||
|     mmax(_mmax), | ||||
|     nstep(_nstep), | ||||
|     steps(0), | ||||
|     level(1), | ||||
|     Preconditioner(Prec), | ||||
|     Linop(_Linop) | ||||
|   { | ||||
|   }; | ||||
|  | ||||
|   // Solve Linop psi = src. psi is overwritten (the initial guess is zero). | ||||
|   // Returns silently, after logging, if MaxIterations restarts do not converge. | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|  | ||||
|     psi=Zero(); | ||||
|     RealD cp, ssq,rsq; | ||||
|     ssq=norm2(src); | ||||
|     rsq=Tolerance*Tolerance*ssq; | ||||
|  | ||||
|     Field r(src.Grid()); | ||||
|  | ||||
|     PrecTimer.Reset(); | ||||
|     MatTimer.Reset(); | ||||
|     LinalgTimer.Reset(); | ||||
|  | ||||
|     GridStopWatch SolverTimer; | ||||
|     SolverTimer.Start(); | ||||
|  | ||||
|     steps=0; | ||||
|     for(int k=0;k<MaxIterations;k++){ | ||||
|  | ||||
|       cp=GCRnStep(src,psi,rsq); | ||||
|  | ||||
|       GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl; | ||||
|  | ||||
|       if(cp<rsq) { | ||||
|  | ||||
|         SolverTimer.Stop(); | ||||
|  | ||||
|         // Recompute the residual from scratch to report it next to the recursive one. | ||||
|         Linop.Op(psi,r); | ||||
|         axpy(r,-1.0,src,r); | ||||
|         RealD tr = norm2(r); | ||||
|         GCRLogLevel<<"PGCR: Converged on iteration " <<steps | ||||
|                  << " computed residual "<<sqrt(cp/ssq) | ||||
|                  << " true residual "    <<sqrt(tr/ssq) | ||||
|                  << " target "           <<Tolerance <<std::endl; | ||||
|  | ||||
|         GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl; | ||||
|         return; | ||||
|       } | ||||
|  | ||||
|     } | ||||
|     GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; | ||||
|     //    assert(0); | ||||
|   } | ||||
|  | ||||
|   // One restart cycle. Updates psi in place and returns the squared residual norm | ||||
|   // reached when the cycle stops (after nstep steps, or as soon as it is below rsq). | ||||
|   RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ | ||||
|  | ||||
|     assert(mmax>0); // history is indexed modulo mmax and p[0]/q[0] are always used | ||||
|  | ||||
|     RealD cp; | ||||
|     ComplexD a, b; | ||||
|     ComplexD rq; | ||||
|  | ||||
|     GridBase *grid = src.Grid(); | ||||
|  | ||||
|     Field r(grid); | ||||
|     Field z(grid); | ||||
|     Field Az(grid); | ||||
|  | ||||
|     //////////////////////////////// | ||||
|     // history for flexible orthog | ||||
|     //////////////////////////////// | ||||
|     std::vector<Field> q(mmax,grid); | ||||
|     std::vector<Field> p(mmax,grid); | ||||
|     std::vector<RealD> qq(mmax); | ||||
|  | ||||
|     GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl; | ||||
|  | ||||
|     ////////////////////////////////// | ||||
|     // initial guess x0 = psi may be nonzero (restarts). | ||||
|     // r0 = src - A x0 | ||||
|     ////////////////////////////////// | ||||
|     MatTimer.Start(); | ||||
|     Linop.Op(psi,Az); | ||||
|     MatTimer.Stop(); | ||||
|  | ||||
|     LinalgTimer.Start(); | ||||
|     r=src-Az; | ||||
|     LinalgTimer.Stop(); | ||||
|     GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl; | ||||
|  | ||||
|     ///////////////////// | ||||
|     // z = Prec(r), first search direction | ||||
|     ///////////////////// | ||||
|     PrecTimer.Start(); | ||||
|     Preconditioner(r,z); | ||||
|     PrecTimer.Stop(); | ||||
|  | ||||
|     MatTimer.Start(); | ||||
|     Linop.Op(z,Az); | ||||
|     MatTimer.Stop(); | ||||
|  | ||||
|     LinalgTimer.Start(); | ||||
|  | ||||
|     //p[0],q[0],qq[0] | ||||
|     p[0]= z; | ||||
|     q[0]= Az; | ||||
|     qq[0]= norm2(Az); | ||||
|  | ||||
|     cp =norm2(r); | ||||
|     LinalgTimer.Stop(); | ||||
|  | ||||
|     for(int k=0;k<nstep;k++){ | ||||
|  | ||||
|       steps++; | ||||
|  | ||||
|       int kp     = k+1; | ||||
|       int peri_k = k %mmax; | ||||
|       int peri_kp= kp%mmax; | ||||
|  | ||||
|       // Minimise the residual along q[k]: a = <q_k, r> / <q_k, q_k> | ||||
|       LinalgTimer.Start(); | ||||
|       rq= innerProduct(q[peri_k],r); // what if rAr not real? | ||||
|       a = rq/qq[peri_k]; | ||||
|  | ||||
|       axpy(psi,a,p[peri_k],psi); | ||||
|  | ||||
|       cp = axpy_norm(r,-a,q[peri_k],r); | ||||
|       LinalgTimer.Stop(); | ||||
|  | ||||
|       GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; | ||||
|  | ||||
|       if((k==nstep-1)||(cp<rsq)){ | ||||
|         return cp; | ||||
|       } | ||||
|  | ||||
|  | ||||
|       PrecTimer.Start(); | ||||
|       Preconditioner(r,z);// solve Az = r | ||||
|       PrecTimer.Stop(); | ||||
|  | ||||
|       MatTimer.Start(); | ||||
|       Linop.Op(z,Az); | ||||
|       MatTimer.Stop(); | ||||
|  | ||||
|       LinalgTimer.Start(); | ||||
|  | ||||
|       q[peri_kp]=Az; | ||||
|       p[peri_kp]=z; | ||||
|  | ||||
|       int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history. | ||||
|       for(int back=0;back<northog;back++){ | ||||
|  | ||||
|         int peri_back=(k-back)%mmax;      assert((k-back)>=0); | ||||
|  | ||||
|         // NOTE(review): only the real part of <q_back, Az> is used although a is complex; | ||||
|         // confirm this is intended for a non-Hermitian operator. | ||||
|         b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; | ||||
|         p[peri_kp]=p[peri_kp]+b*p[peri_back]; | ||||
|         q[peri_kp]=q[peri_kp]+b*q[peri_back]; | ||||
|  | ||||
|       } | ||||
|       qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm | ||||
|       LinalgTimer.Stop(); | ||||
|     } | ||||
|     assert(0); // never reached | ||||
|     return cp; | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
							
								
								
									
										371
									
								
								Grid/algorithms/iterative/QuasiMinimalResidual.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										371
									
								
								Grid/algorithms/iterative/QuasiMinimalResidual.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,371 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/algorithms/iterative/QuasiMinimalResidual.h | ||||
|  | ||||
| Copyright (C) 2019 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| // Real part of innerProduct(l, G5R5(r)); the full complex value is also printed. | ||||
| // G5R5 is applied to r into a temporary, so neither argument is modified here. | ||||
| template<class Field>  | ||||
| RealD innerG5ProductReal(Field &l, Field &r) | ||||
| { | ||||
|   Field tmp(l.Grid()); | ||||
|   G5R5(tmp,r); | ||||
|   ComplexD ip =innerProduct(l,tmp); | ||||
|   std::cout << "innerProductRealG5R5 "<<ip<<std::endl; | ||||
|   return ip.real(); | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| // Quasi-minimal residual solver for LinOp x = b (uses both Op and AdjOp). | ||||
| // Tolerance is a bound on |r|/|b|; MaxIterations bounds the iteration count. | ||||
| // If ErrorOnNoConverge is set, failing to converge trips an assert. | ||||
| // IterationCount holds the number of iterations performed by the last call. | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| template<class Field> | ||||
| class QuasiMinimalResidual : public OperatorFunction<Field> { | ||||
|  public: | ||||
|   using OperatorFunction<Field>::operator(); | ||||
|  | ||||
|   bool ErrorOnNoConverge;  | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   Integer IterationCount; | ||||
|  | ||||
|   // Initialisers follow the member declaration order. | ||||
|   QuasiMinimalResidual(RealD   tol, | ||||
|                        Integer maxit, | ||||
|                        bool    err_on_no_conv = true) | ||||
|       : ErrorOnNoConverge(err_on_no_conv) | ||||
|       , Tolerance(tol) | ||||
|       , MaxIterations(maxit) | ||||
|       , IterationCount(0) | ||||
|   {}; | ||||
|  | ||||
| #if 1 | ||||
|   // Coupled two-term QMR without preconditioning. x is used as the initial guess | ||||
|   // and updated in place. Breakdowns (zero rho, xi, delta, ep, beta, gamma) assert. | ||||
|   void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)  | ||||
|   { | ||||
|     RealD resid; | ||||
|     IterationCount=0; | ||||
|  | ||||
|     RealD  rho, rho_1, xi, gamma, gamma_1, theta, theta_1; | ||||
|     RealD  eta, delta, ep, beta;  | ||||
|  | ||||
|     GridBase *Grid = b.Grid(); | ||||
|     Field r(Grid), d(Grid), s(Grid); | ||||
|     Field v(Grid), w(Grid), y(Grid),  z(Grid); | ||||
|     Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid); | ||||
|     Field p(Grid), q(Grid), p_tld(Grid); | ||||
|  | ||||
|     // norm2() is the squared 2-norm; the recurrences below need the norm itself. | ||||
|     RealD normb = sqrt(norm2(b)); | ||||
|     assert(normb> 0.0); | ||||
|  | ||||
|     LinOp.Op(x,r); r = b - r; | ||||
|  | ||||
|     resid = sqrt(norm2(r))/normb; | ||||
|     if (resid <= Tolerance) { | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     v_tld = r; | ||||
|     y = v_tld; | ||||
|     rho = sqrt(norm2(y)); | ||||
|  | ||||
|     // Take Gamma5 conjugate | ||||
|     //    Gamma G5(Gamma::Algebra::Gamma5); | ||||
|     //    G5R5(w_tld,r); | ||||
|     //    w_tld = G5* v_tld; | ||||
|     w_tld=v_tld; | ||||
|     z = w_tld; | ||||
|     xi = sqrt(norm2(z)); | ||||
|  | ||||
|     gamma = 1.0; | ||||
|     eta   = -1.0; | ||||
|     theta = 0.0; | ||||
|  | ||||
|     for (int i = 1; i <= MaxIterations; i++) { | ||||
|  | ||||
|       IterationCount = i; | ||||
|  | ||||
|       // Breakdown tests | ||||
|       assert( rho != 0.0); | ||||
|       assert( xi  != 0.0); | ||||
|  | ||||
|       // Normalise the two Lanczos vectors | ||||
|       v = (1. / rho) * v_tld; | ||||
|       y = (1. / rho) * y; | ||||
|  | ||||
|       w = (1. / xi) * w_tld; | ||||
|       z = (1. / xi) * z; | ||||
|  | ||||
|       ComplexD Zdelta = innerProduct(z, y); // Complex? | ||||
|       std::cout << "Zdelta "<<Zdelta<<std::endl; | ||||
|       delta = Zdelta.real(); | ||||
|       assert( delta != 0.0); // divided by below | ||||
|  | ||||
|       y_tld = y;  | ||||
|       z_tld = z; | ||||
|  | ||||
|       if (i > 1) { | ||||
|         p = y_tld - (xi  * delta / ep) * p; | ||||
|         q = z_tld - (rho * delta / ep) * q; | ||||
|       } else { | ||||
|         p = y_tld; | ||||
|         q = z_tld; | ||||
|       } | ||||
|  | ||||
|       LinOp.Op(p,p_tld);      //     p_tld = A * p; | ||||
|       ComplexD Zep = innerProduct(q, p_tld); | ||||
|       ep=Zep.real(); | ||||
|       std::cout << "Zep "<<Zep <<std::endl; | ||||
|       // Complex Audit | ||||
|       assert(abs(ep)>0); | ||||
|  | ||||
|       beta = ep / delta; | ||||
|       assert(abs(beta)>0); | ||||
|  | ||||
|       v_tld = p_tld - beta * v; | ||||
|       y = v_tld; | ||||
|  | ||||
|       rho_1 = rho; | ||||
|       rho   = sqrt(norm2(y)); | ||||
|       LinOp.AdjOp(q,w_tld); | ||||
|       w_tld = w_tld - beta * w; | ||||
|       z = w_tld; | ||||
|  | ||||
|       xi = sqrt(norm2(z)); | ||||
|  | ||||
|       gamma_1 = gamma; | ||||
|       theta_1 = theta; | ||||
|  | ||||
|       theta   = rho / (gamma_1 * beta); | ||||
|       gamma   = 1.0 / sqrt(1.0 + theta * theta); | ||||
|       std::cout << "theta "<<theta<<std::endl; | ||||
|       std::cout << "gamma "<<gamma<<std::endl; | ||||
|  | ||||
|       assert(abs(gamma)> 0.0); | ||||
|  | ||||
|       eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1); | ||||
|  | ||||
|       if (i > 1) { | ||||
|         d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d; | ||||
|         s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s; | ||||
|       } else { | ||||
|         d = eta * p; | ||||
|         s = eta * p_tld; | ||||
|       } | ||||
|  | ||||
|       x =x+d;                            // update approximation vector | ||||
|       r =r-s;                            // compute residual | ||||
|  | ||||
|       if ((resid = sqrt(norm2(r)) / normb) <= Tolerance) { | ||||
|         return; | ||||
|       } | ||||
|       std::cout << "Iteration "<<i<<" resid " << resid<<std::endl; | ||||
|     } | ||||
|     if (ErrorOnNoConverge) assert(0); | ||||
|     return;                            // no convergence | ||||
|   } | ||||
| #else | ||||
|   // QMRg5 SMP thesis | ||||
|   void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)  | ||||
|   { | ||||
|     // Real scalars | ||||
|     GridBase *grid = b.Grid(); | ||||
|  | ||||
|     Field    r(grid); | ||||
|     Field    p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid); | ||||
|     Field    v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid); | ||||
|     Field    tmp(grid); | ||||
|  | ||||
|     RealD    w; | ||||
|     RealD    z1, z2; | ||||
|     RealD    delta_m, delta_m_minus_1; | ||||
|     RealD    c_m_plus_1, c_m, c_m_minus_1; | ||||
|     RealD    s_m_plus_1, s_m, s_m_minus_1; | ||||
|     RealD    alpha, beta, gamma, epsilon; | ||||
|     RealD    mu, nu, rho, theta, xi, chi; | ||||
|     RealD    mod2r, mod2b; | ||||
|     RealD    tau2, target2; | ||||
|  | ||||
|     mod2b=norm2(b); | ||||
|  | ||||
|     ///////////////////////// | ||||
|     // Initial residual | ||||
|     ///////////////////////// | ||||
|     LinOp.Op(x,tmp); | ||||
|     r = b - tmp; | ||||
|  | ||||
|     ///////////////////////// | ||||
|     // \mu = \rho = |r_0| | ||||
|     ///////////////////////// | ||||
|     mod2r = norm2(r); | ||||
|     rho = sqrt( mod2r); | ||||
|     mu=rho; | ||||
|      | ||||
|     std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl; | ||||
|     ///////////////////////// | ||||
|     // Zero negative history | ||||
|     ///////////////////////// | ||||
|     v_m_plus_1  = Zero(); | ||||
|     v_m_minus_1 = Zero(); | ||||
|     p_m_minus_1 = Zero(); | ||||
|     p_m_minus_2 = Zero(); | ||||
|  | ||||
|     // v0 | ||||
|     v_m = (1.0/rho)*r; | ||||
|  | ||||
|     ///////////////////////// | ||||
|     // Initial coeffs | ||||
|     ///////////////////////// | ||||
|     delta_m_minus_1 = 1.0; | ||||
|     c_m_minus_1     = 1.0; | ||||
|     c_m             = 1.0; | ||||
|     s_m_minus_1     = 0.0; | ||||
|     s_m             = 0.0; | ||||
|  | ||||
|     ///////////////////////// | ||||
|     // Set up convergence check | ||||
|     ///////////////////////// | ||||
|     tau2    = mod2r; | ||||
|     target2 = mod2b * Tolerance*Tolerance; | ||||
|   | ||||
|     for(int iter = 0 ; iter < MaxIterations; iter++){ | ||||
|  | ||||
|       ///////////////////////// | ||||
|       // \delta_m = (v_m, \gamma_5 v_m)  | ||||
|       ///////////////////////// | ||||
|       delta_m = innerG5ProductReal(v_m,v_m); | ||||
|       std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl; | ||||
|  | ||||
|       ///////////////////////// | ||||
|       // tmp = A v_m | ||||
|       ///////////////////////// | ||||
|       LinOp.Op(v_m,tmp); | ||||
|  | ||||
|       ///////////////////////// | ||||
|       // \alpha = (v_m, \gamma_5 temp) / \delta_m  | ||||
|       ///////////////////////// | ||||
|       alpha = innerG5ProductReal(v_m,tmp); | ||||
|       alpha = alpha/delta_m ; | ||||
|       std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl; | ||||
|  | ||||
|       ///////////////////////// | ||||
|       // \beta = \rho \delta_m / \delta_{m-1} | ||||
|       ///////////////////////// | ||||
|       beta = rho * delta_m / delta_m_minus_1; | ||||
|       std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl; | ||||
|  | ||||
|       ///////////////////////// | ||||
|       // \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1} | ||||
|       ///////////////////////// | ||||
|       v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1; | ||||
|  | ||||
|       /////////////////////////////// | ||||
|       // \rho = || \tilde{v}_{m+1} || | ||||
|       /////////////////////////////// | ||||
|       rho = sqrt( norm2(v_m_plus_1) ); | ||||
|       std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl; | ||||
|  | ||||
|       /////////////////////////////// | ||||
|       //      v_{m+1} = \tilde{v}_{m+1} / \rho | ||||
|       /////////////////////////////// | ||||
|       v_m_plus_1 = (1.0 / rho) * v_m_plus_1; | ||||
|  | ||||
|       //////////////////////////////// | ||||
|       // QMR recurrence coefficients. | ||||
|       //////////////////////////////// | ||||
|       theta      = s_m_minus_1 * beta; | ||||
|       gamma      = c_m_minus_1 * beta; | ||||
|       epsilon    =  c_m * gamma + s_m * alpha; | ||||
|       xi         = -s_m * gamma + c_m * alpha; | ||||
|       nu         = sqrt( xi*xi + rho*rho ); | ||||
|       c_m_plus_1 = fabs(xi) / nu; | ||||
|       if ( xi == 0.0 ) { | ||||
|         s_m_plus_1 = 1.0; | ||||
|       } else { | ||||
|         s_m_plus_1 = c_m_plus_1 * rho / xi; | ||||
|       } | ||||
|       chi = c_m_plus_1 * xi + s_m_plus_1 * rho; | ||||
|  | ||||
|       std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl; | ||||
|       std::cout << "QuasiMinimalResidual coeffs "<< chi   <<std::endl; | ||||
|  | ||||
|       //////////////////////////////// | ||||
|       //p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi | ||||
|       //////////////////////////////// | ||||
|       p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2; | ||||
|  | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       //      \psi = \psi + c_{m+1} \mu p_m | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       x = x + ( c_m_plus_1 * mu ) * p_m; | ||||
|  | ||||
|       //////////////////////////////////////// | ||||
|       // Shift the scalar coefficient history for the next iteration | ||||
|       //////////////////////////////////////// | ||||
|       mu              = -s_m_plus_1 * mu; | ||||
|       delta_m_minus_1 = delta_m; | ||||
|       c_m_minus_1     = c_m; | ||||
|       c_m             = c_m_plus_1; | ||||
|       s_m_minus_1     = s_m; | ||||
|       s_m             = s_m_plus_1; | ||||
|  | ||||
|       //////////////////////////////////// | ||||
|       // Could use pointer swizzle games. | ||||
|       //////////////////////////////////// | ||||
|       v_m_minus_1 = v_m; | ||||
|       v_m         = v_m_plus_1; | ||||
|       p_m_minus_2 = p_m_minus_1; | ||||
|       p_m_minus_1 = p_m; | ||||
|  | ||||
|  | ||||
|       ///////////////////////////////////// | ||||
|       // Convergence checks | ||||
|       ///////////////////////////////////// | ||||
|       z1 = RealD(iter+1.0); | ||||
|       z2 = z1 + 1.0; | ||||
|       tau2 = tau2 *( z2 / z1 ) * s_m * s_m; | ||||
|       std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl; | ||||
|       std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl; | ||||
|  | ||||
|       // Compute true residual (the "1 ||" forces this on every iteration) | ||||
|       mod2r = tau2; | ||||
|       if ( 1 || (tau2 < (100.0 * target2)) ) { | ||||
|         LinOp.Op(x,tmp); | ||||
|         r = b - tmp; | ||||
|         mod2r = norm2(r); | ||||
|         std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl; | ||||
|       } | ||||
|  | ||||
|  | ||||
|       if ( mod2r < target2 ) {  | ||||
|  | ||||
|         std::cout << " QuasiMinimumResidual has converged"<<std::endl; | ||||
|         return; | ||||
|  | ||||
|       } | ||||
|  | ||||
|     } | ||||
|  | ||||
|  | ||||
|   } | ||||
| #endif | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| @@ -297,9 +297,9 @@ namespace Grid { | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // src_o = (source_o - Moe MeeInv source_e) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);      | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);      | ||||
|  | ||||
|       _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. | ||||
|     } | ||||
| @@ -317,17 +317,17 @@ namespace Grid { | ||||
|       /////////////////////////////////////////////////// | ||||
|       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|       /////////////////////////////////////////////////// | ||||
|       _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even); | ||||
|       src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even); | ||||
|       _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even); | ||||
|       _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even); | ||||
|       src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even); | ||||
|       _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even); | ||||
|       | ||||
|       setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even); | ||||
|       setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd ); | ||||
|       setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even); | ||||
|       setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd ); | ||||
|     } | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||
|     { | ||||
|       SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd); | ||||
|     }; | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) | ||||
|     { | ||||
| @@ -366,13 +366,13 @@ namespace Grid { | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // src_o = Mdag * (source_o - Moe MeeInv source_e) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);      | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);      | ||||
|  | ||||
|       // get the right MpcDag | ||||
|       SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);        | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);        | ||||
|  | ||||
|     } | ||||
|     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) | ||||
| @@ -386,17 +386,17 @@ namespace Grid { | ||||
|       /////////////////////////////////////////////////// | ||||
|       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|       /////////////////////////////////////////////////// | ||||
|       _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even); | ||||
|       src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even); | ||||
|       _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even); | ||||
|       _Matrix.Meooe(sol_o,tmp);          assert(  tmp.Checkerboard()   ==Even); | ||||
|       src_e_i = src_e-tmp;               assert(  src_e_i.Checkerboard() ==Even); | ||||
|       _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.Checkerboard() ==Even); | ||||
|       | ||||
|       setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even); | ||||
|       setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd ); | ||||
|       setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even); | ||||
|       setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd ); | ||||
|     } | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||
|     { | ||||
|       SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd); | ||||
|     }; | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) | ||||
|     { | ||||
| @@ -405,6 +405,70 @@ namespace Grid { | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Red-black (Schur) solve wrapper that hands NonHermitianSchurDiagMooeeOperator to the | ||||
|   // wrapped solver. Unlike the Hermitian variants above, the odd source is NOT multiplied | ||||
|   // by the adjoint preconditioned operator. | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field>  | ||||
|   { | ||||
|     public: | ||||
|       typedef CheckerBoardedSparseMatrixBase<Field> Matrix; | ||||
|  | ||||
|       NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false, | ||||
|           const bool _solnAsInitGuess = false)   | ||||
|       : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {}; | ||||
|  | ||||
|       ////////////////////////////////////////////////////// | ||||
|       // Override RedBlack specialisation | ||||
|       ////////////////////////////////////////////////////// | ||||
|  | ||||
|       // Split src into its even/odd halves and form the odd-checkerboard source. | ||||
|       virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o) | ||||
|       { | ||||
|         GridBase* grid  = _Matrix.RedBlackGrid(); | ||||
|  | ||||
|         Field  tmp(grid); | ||||
|         Field Mtmp(grid); | ||||
|  | ||||
|         pickCheckerboard(Even, src_e, src); | ||||
|         pickCheckerboard(Odd , src_o, src); | ||||
|  | ||||
|         ///////////////////////////////////////////////////// | ||||
|         // src_o = source_o - Moe MeeInv source_e | ||||
|         ///////////////////////////////////////////////////// | ||||
|         _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even ); | ||||
|         _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );      | ||||
|         src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );      | ||||
|       } | ||||
|        | ||||
|       // Reconstruct the even half from the odd solution and write both halves into sol. | ||||
|       virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) | ||||
|       { | ||||
|         GridBase* grid  = _Matrix.RedBlackGrid(); | ||||
|  | ||||
|         Field     tmp(grid); | ||||
|         Field   sol_e(grid); | ||||
|         Field src_e_i(grid); | ||||
|          | ||||
|         /////////////////////////////////////////////////// | ||||
|         // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|         /////////////////////////////////////////////////// | ||||
|         _Matrix.Meooe(sol_o, tmp);         assert(     tmp.Checkerboard() == Even ); | ||||
|         src_e_i = src_e - tmp;             assert( src_e_i.Checkerboard() == Even ); | ||||
|         _Matrix.MooeeInv(src_e_i, sol_e);  assert(   sol_e.Checkerboard() == Even ); | ||||
|         | ||||
|         setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even ); | ||||
|         setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd  ); | ||||
|       } | ||||
|  | ||||
|       // Single right-hand side: run the wrapped solver on the odd checkerboard. | ||||
|       // (_HermitianRBSolver is just the base-class member name; the operator passed is non-Hermitian.) | ||||
|       virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) | ||||
|       { | ||||
|         NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); | ||||
|         this->_HermitianRBSolver(_OpEO, src_o, sol_o);  assert(sol_o.Checkerboard() == Odd); | ||||
|       } | ||||
|  | ||||
|       // Multiple right-hand sides: same as above, forwarding the vectors of fields. | ||||
|       virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o) | ||||
|       { | ||||
|         NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); | ||||
|         this->_HermitianRBSolver(_OpEO, src_o, sol_o);  | ||||
|       } | ||||
|   }; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Site diagonal is identity, right preconditioned by Mee^inv | ||||
|   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta | ||||
| @@ -437,12 +501,12 @@ namespace Grid { | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // src_o = Mdag * (source_o - Moe MeeInv source_e) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);      | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||
|       tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);      | ||||
|  | ||||
|       // get the right MpcDag | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);        | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);        | ||||
|     } | ||||
|  | ||||
|     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) | ||||
| @@ -463,12 +527,12 @@ namespace Grid { | ||||
|       /////////////////////////////////////////////////// | ||||
|       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|       /////////////////////////////////////////////////// | ||||
|       _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even); | ||||
|       tmp = src_e-tmp;               assert(  src_e.checkerboard ==Even); | ||||
|       _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even); | ||||
|       _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.Checkerboard()   ==Even); | ||||
|       tmp = src_e-tmp;               assert(  src_e.Checkerboard() ==Even); | ||||
|       _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.Checkerboard() ==Even); | ||||
|       | ||||
|       setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even); | ||||
|       setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd ); | ||||
|       setCheckerboard(sol,sol_e);    assert(  sol_e.Checkerboard() ==Even); | ||||
|       setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.Checkerboard() ==Odd ); | ||||
|     }; | ||||
|  | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||
| @@ -482,5 +546,76 @@ namespace Grid { | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Red-black solver pieces built around NonHermitianSchurDiagTwoOperator: | ||||
|   //   RedBlackSource   : form the odd-checkerboard right hand side | ||||
|   //   RedBlackSolve    : run the inner solver on the odd checkerboard | ||||
|   //   RedBlackSolution : rebuild the full-lattice solution from the odd result | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field>  | ||||
|   { | ||||
|     public: | ||||
|       typedef CheckerBoardedSparseMatrixBase<Field> Matrix; | ||||
|  | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // Wrap the usual normal equations Schur trick | ||||
|       // (solver and guess options are forwarded unchanged to SchurRedBlackBase) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false, | ||||
|           const bool _solnAsInitGuess = false)   | ||||
|       : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {} | ||||
|  | ||||
|       // Split src into its even and odd checkerboards and turn src_o into the | ||||
|       // right hand side of the odd-checkerboard system. | ||||
|       virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o) | ||||
|       { | ||||
|         GridBase* grid = _Matrix.RedBlackGrid(); | ||||
|  | ||||
|         Field  tmp(grid); | ||||
|         Field Mtmp(grid); | ||||
|  | ||||
|         pickCheckerboard(Even, src_e, src); | ||||
|         pickCheckerboard(Odd , src_o, src); | ||||
|        | ||||
|         ///////////////////////////////////////////////////// | ||||
|         // src_o = source_o - Moe MeeInv source_e | ||||
|         // (no adjoint of the operator is applied to the source in this variant) | ||||
|         ///////////////////////////////////////////////////// | ||||
|         _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even ); | ||||
|         _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );      | ||||
|         src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );      | ||||
|       } | ||||
|  | ||||
|       // Rebuild the full solution sol from the odd-checkerboard result sol_o | ||||
|       // and the even part of the source src_e. | ||||
|       virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) | ||||
|       { | ||||
|         GridBase* grid = _Matrix.RedBlackGrid(); | ||||
|  | ||||
|         Field sol_o_i(grid); | ||||
|         Field     tmp(grid); | ||||
|         Field   sol_e(grid); | ||||
|  | ||||
|         //////////////////////////////////////////////// | ||||
|         // MooeeInv due to precond | ||||
|         //////////////////////////////////////////////// | ||||
|         _Matrix.MooeeInv(sol_o, tmp); | ||||
|         sol_o_i = tmp; | ||||
|  | ||||
|         /////////////////////////////////////////////////// | ||||
|         // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|         // (sol_o here is the un-preconditioned odd solution sol_o_i) | ||||
|         /////////////////////////////////////////////////// | ||||
|         _Matrix.Meooe(sol_o_i, tmp);    assert(   tmp.Checkerboard() == Even ); | ||||
|         tmp = src_e - tmp;              assert( src_e.Checkerboard() == Even ); | ||||
|         _Matrix.MooeeInv(tmp, sol_e);   assert( sol_e.Checkerboard() == Even ); | ||||
|         | ||||
|         // Scatter both checkerboards into the full-lattice field | ||||
|         setCheckerboard(sol, sol_e);    assert(   sol_e.Checkerboard() == Even ); | ||||
|         setCheckerboard(sol, sol_o_i);  assert( sol_o_i.Checkerboard() == Odd  ); | ||||
|       } | ||||
|  | ||||
|       // Single right hand side: hand the DiagTwo operator to the base-class solver | ||||
|       virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) | ||||
|       { | ||||
|         NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix); | ||||
|         this->_HermitianRBSolver(_OpEO, src_o, sol_o); | ||||
|       } | ||||
|  | ||||
|       // Multiple right hand sides: same operator, vector overload of the solver | ||||
|       virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o,  std::vector<Field>& sol_o) | ||||
|       { | ||||
|         NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix); | ||||
|         this->_HermitianRBSolver(_OpEO, src_o, sol_o);  | ||||
|       } | ||||
|   }; | ||||
| } | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,70 +1,11 @@ | ||||
| #include <Grid/GridCore.h> | ||||
| #include <fcntl.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| MemoryStats *MemoryProfiler::stats = nullptr; | ||||
| bool         MemoryProfiler::debug = false; | ||||
|  | ||||
| int PointerCache::victim; | ||||
|  | ||||
| PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; | ||||
|  | ||||
| void *PointerCache::Insert(void *ptr,size_t bytes) { | ||||
|  | ||||
|   if (bytes < 4096 ) return ptr; | ||||
|  | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   void * ret = NULL; | ||||
|   int v = -1; | ||||
|  | ||||
|   for(int e=0;e<Ncache;e++) { | ||||
|     if ( Entries[e].valid==0 ) { | ||||
|       v=e;  | ||||
|       break; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   if ( v==-1 ) { | ||||
|     v=victim; | ||||
|     victim = (victim+1)%Ncache; | ||||
|   } | ||||
|  | ||||
|   if ( Entries[v].valid ) { | ||||
|     ret = Entries[v].address; | ||||
|     Entries[v].valid = 0; | ||||
|     Entries[v].address = NULL; | ||||
|     Entries[v].bytes = 0; | ||||
|   } | ||||
|  | ||||
|   Entries[v].address=ptr; | ||||
|   Entries[v].bytes  =bytes; | ||||
|   Entries[v].valid  =1; | ||||
|  | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| void *PointerCache::Lookup(size_t bytes) { | ||||
|  | ||||
|  if (bytes < 4096 ) return NULL; | ||||
|  | ||||
| #ifdef _OPENMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   for(int e=0;e<Ncache;e++){ | ||||
|     if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) { | ||||
|       Entries[e].valid = 0; | ||||
|       return Entries[e].address; | ||||
|     } | ||||
|   } | ||||
|   return NULL; | ||||
| } | ||||
|  | ||||
|  | ||||
| void check_huge_pages(void *Buf,uint64_t BYTES) | ||||
| { | ||||
| #ifdef __linux__ | ||||
| @@ -90,7 +31,7 @@ void check_huge_pages(void *Buf,uint64_t BYTES) | ||||
|       ++n4ktotal; | ||||
|       if (pageaddr != baseaddr + j * page_size) | ||||
| 	++nnothuge; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   int rank = CartesianCommunicator::RankWorld(); | ||||
|   printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); | ||||
| @@ -106,20 +47,21 @@ std::string sizeString(const size_t bytes) | ||||
|   double                 count = bytes; | ||||
|    | ||||
|   while (count >= 1024 && s < 7) | ||||
|   { | ||||
|     { | ||||
|       s++; | ||||
|       count /= 1024; | ||||
|   } | ||||
|     } | ||||
|   if (count - floor(count) == 0.0) | ||||
|   { | ||||
|     { | ||||
|       snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); | ||||
|   } | ||||
|     } | ||||
|   else | ||||
|   { | ||||
|     { | ||||
|       snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); | ||||
|   } | ||||
|     } | ||||
|    | ||||
|   return std::string(buf); | ||||
| } | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -24,109 +24,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_ALIGNED_ALLOCATOR_H | ||||
| #define GRID_ALIGNED_ALLOCATOR_H | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once | ||||
|  | ||||
| #ifdef HAVE_MALLOC_MALLOC_H | ||||
| #include <malloc/malloc.h> | ||||
| #endif | ||||
| #ifdef HAVE_MALLOC_H | ||||
| #include <malloc.h> | ||||
| #endif | ||||
|  | ||||
| #ifdef HAVE_MM_MALLOC_H | ||||
| #include <mm_malloc.h> | ||||
| #endif | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|   class PointerCache { | ||||
|   private: | ||||
|  | ||||
|     static const int Ncache=8; | ||||
|     static int victim; | ||||
|  | ||||
|     typedef struct {  | ||||
|       void *address; | ||||
|       size_t bytes; | ||||
|       int valid; | ||||
|     } PointerCacheEntry; | ||||
|      | ||||
|     static PointerCacheEntry Entries[Ncache]; | ||||
|  | ||||
|   public: | ||||
|  | ||||
|  | ||||
|     static void *Insert(void *ptr,size_t bytes) ; | ||||
|     static void *Lookup(size_t bytes) ; | ||||
|  | ||||
|   }; | ||||
|    | ||||
|   std::string sizeString(size_t bytes); | ||||
|  | ||||
|   struct MemoryStats | ||||
|   { | ||||
|     size_t totalAllocated{0}, maxAllocated{0},  | ||||
|            currentlyAllocated{0}, totalFreed{0}; | ||||
|   }; | ||||
|      | ||||
|   class MemoryProfiler | ||||
|   { | ||||
|   public: | ||||
|     static MemoryStats *stats; | ||||
|     static bool        debug; | ||||
|   }; | ||||
|  | ||||
|   #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" | ||||
|   #define profilerDebugPrint \ | ||||
|   if (MemoryProfiler::stats)\ | ||||
|   {\ | ||||
|     auto s = MemoryProfiler::stats;\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \ | ||||
|               << std::endl;\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \ | ||||
|               << std::endl;\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ | ||||
|               << std::endl;\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \ | ||||
|               << std::endl;\ | ||||
|   } | ||||
|  | ||||
|   #define profilerAllocate(bytes)\ | ||||
|   if (MemoryProfiler::stats)\ | ||||
|   {\ | ||||
|     auto s = MemoryProfiler::stats;\ | ||||
|     s->totalAllocated     += (bytes);\ | ||||
|     s->currentlyAllocated += (bytes);\ | ||||
|     s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated);\ | ||||
|   }\ | ||||
|   if (MemoryProfiler::debug)\ | ||||
|   {\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\ | ||||
|     profilerDebugPrint;\ | ||||
|   } | ||||
|  | ||||
|   #define profilerFree(bytes)\ | ||||
|   if (MemoryProfiler::stats)\ | ||||
|   {\ | ||||
|     auto s = MemoryProfiler::stats;\ | ||||
|     s->totalFreed         += (bytes);\ | ||||
|     s->currentlyAllocated -= (bytes);\ | ||||
|   }\ | ||||
|   if (MemoryProfiler::debug)\ | ||||
|   {\ | ||||
|     std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\ | ||||
|     profilerDebugPrint;\ | ||||
|   } | ||||
|  | ||||
|   void check_huge_pages(void *Buf,uint64_t BYTES); | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////// | ||||
| // A lattice of something, but assume the something is SIMDized. | ||||
| //////////////////////////////////////////////////////////////////// | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| template<typename _Tp> | ||||
| class alignedAllocator { | ||||
| @@ -151,68 +53,29 @@ public: | ||||
|   {  | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|  | ||||
|     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); | ||||
|     //    if ( ptr != NULL )  | ||||
|     //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl; | ||||
|  | ||||
|     ////////////////// | ||||
|     // Hack 2MB align; could make option probably doesn't need configurability | ||||
|     ////////////////// | ||||
| //define GRID_ALLOC_ALIGN (128) | ||||
| #define GRID_ALLOC_ALIGN (2*1024*1024) | ||||
| #ifdef HAVE_MM_MALLOC_H | ||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); | ||||
| #else | ||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); | ||||
| #endif | ||||
|     //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl; | ||||
|     // First touch optimise in threaded loop | ||||
|     uint8_t *cp = (uint8_t *)ptr; | ||||
| #ifdef GRID_OMP | ||||
| #pragma omp parallel for | ||||
| #endif | ||||
|     for(size_type n=0;n<bytes;n+=4096){ | ||||
|       cp[n]=0; | ||||
|     } | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
|  | ||||
|   void deallocate(pointer __p, size_type __n) {  | ||||
|   void deallocate(pointer __p, size_type __n)  | ||||
|   {  | ||||
|     size_type bytes = __n * sizeof(_Tp); | ||||
|  | ||||
|     profilerFree(bytes); | ||||
|  | ||||
|     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); | ||||
|  | ||||
| #ifdef HAVE_MM_MALLOC_H | ||||
|     if ( __freeme ) _mm_free((void *)__freeme);  | ||||
| #else | ||||
|     if ( __freeme ) free((void *)__freeme); | ||||
| #endif | ||||
|     MemoryManager::CpuFree((void *)__p,bytes); | ||||
|   } | ||||
|   void construct(pointer __p, const _Tp& __val) { }; | ||||
|  | ||||
|   // FIXME: hack for the copy constructor, eventually it must be avoided | ||||
|   //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; | ||||
|   void construct(pointer __p, const _Tp& __val) { assert(0);}; | ||||
|   void construct(pointer __p) { }; | ||||
|   void destroy(pointer __p) { }; | ||||
| }; | ||||
| template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } | ||||
| template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // MPI3 : comms must use shm region | ||||
| // SHMEM: comms must use symmetric heap | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #ifdef GRID_COMMS_SHMEM | ||||
| extern "C" {  | ||||
| #include <mpp/shmem.h> | ||||
| extern void * shmem_align(size_t, size_t); | ||||
| extern void  shmem_free(void *); | ||||
| } | ||||
| #define PARANOID_SYMMETRIC_HEAP | ||||
| #endif | ||||
|  | ||||
| template<typename _Tp> | ||||
| class commAllocator { | ||||
| class uvmAllocator { | ||||
| public:  | ||||
|   typedef std::size_t     size_type; | ||||
|   typedef std::ptrdiff_t  difference_type; | ||||
| @@ -222,94 +85,47 @@ public: | ||||
|   typedef const _Tp& const_reference; | ||||
|   typedef _Tp        value_type; | ||||
|  | ||||
|   template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; }; | ||||
|   commAllocator() throw() { } | ||||
|   commAllocator(const commAllocator&) throw() { } | ||||
|   template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { } | ||||
|   ~commAllocator() throw() { } | ||||
|   template<typename _Tp1>  struct rebind { typedef uvmAllocator<_Tp1> other; }; | ||||
|   uvmAllocator() throw() { } | ||||
|   uvmAllocator(const uvmAllocator&) throw() { } | ||||
|   template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { } | ||||
|   ~uvmAllocator() throw() { } | ||||
|   pointer       address(reference __x)       const { return &__x; } | ||||
|   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); } | ||||
|  | ||||
| #ifdef GRID_COMMS_SHMEM | ||||
|   pointer allocate(size_type __n, const void* _p= 0) | ||||
|   { | ||||
|   {  | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|  | ||||
|     profilerAllocate(bytes); | ||||
| #ifdef CRAY | ||||
|     _Tp *ptr = (_Tp *) shmem_align(bytes,64); | ||||
| #else | ||||
|     _Tp *ptr = (_Tp *) shmem_align(64,bytes); | ||||
| #endif | ||||
| #ifdef PARANOID_SYMMETRIC_HEAP | ||||
|     static void * bcast; | ||||
|     static long  psync[_SHMEM_REDUCE_SYNC_SIZE]; | ||||
|  | ||||
|     bcast = (void *) ptr; | ||||
|     shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync); | ||||
|  | ||||
|     if ( bcast != ptr ) { | ||||
|       std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); | ||||
|       //      BACKTRACEFILE(); | ||||
|       exit(0); | ||||
|     } | ||||
|     assert( bcast == (void *) ptr); | ||||
| #endif  | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
|   void deallocate(pointer __p, size_type __n) {  | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|  | ||||
|   void deallocate(pointer __p, size_type __n)  | ||||
|   {  | ||||
|     size_type bytes = __n * sizeof(_Tp); | ||||
|     profilerFree(bytes); | ||||
|     shmem_free((void *)__p); | ||||
|     MemoryManager::SharedFree((void *)__p,bytes); | ||||
|   } | ||||
| #else | ||||
|   pointer allocate(size_type __n, const void* _p= 0)  | ||||
|   { | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|      | ||||
|     profilerAllocate(bytes); | ||||
| #ifdef HAVE_MM_MALLOC_H | ||||
|     _Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN); | ||||
| #else | ||||
|     _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes); | ||||
| #endif | ||||
|     uint8_t *cp = (uint8_t *)ptr; | ||||
|     if ( ptr ) {  | ||||
|     // One touch per 4k page, static OMP loop to catch same loop order | ||||
| #ifdef GRID_OMP | ||||
| #pragma omp parallel for schedule(static) | ||||
| #endif | ||||
|       for(size_type n=0;n<bytes;n+=4096){ | ||||
| 	cp[n]=0; | ||||
|       } | ||||
|     } | ||||
|     return ptr; | ||||
|   } | ||||
|   void deallocate(pointer __p, size_type __n) { | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|  | ||||
|     profilerFree(bytes); | ||||
| #ifdef HAVE_MM_MALLOC_H | ||||
|     _mm_free((void *)__p);  | ||||
| #else | ||||
|     free((void *)__p); | ||||
| #endif | ||||
|   } | ||||
| #endif | ||||
|   void construct(pointer __p, const _Tp& __val) { }; | ||||
|   // FIXME: hack for the copy constructor, eventually it must be avoided | ||||
|   void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; | ||||
|   //void construct(pointer __p, const _Tp& __val) { }; | ||||
|   void construct(pointer __p) { }; | ||||
|   void destroy(pointer __p) { }; | ||||
| }; | ||||
| template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; } | ||||
| template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; } | ||||
| template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } | ||||
| template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////// | ||||
| // Template typedefs | ||||
| //////////////////////////////////////////////////////////////////////////////// | ||||
| template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;            | ||||
| template<class T> using commVector = std::vector<T,commAllocator<T> >;               | ||||
| template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >; | ||||
|      | ||||
| }; // namespace Grid | ||||
| #endif | ||||
| template<class T> using commAllocator = uvmAllocator<T>; | ||||
| template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;            | ||||
| template<class T> using commVector = std::vector<T,uvmAllocator<T> >; | ||||
| //template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										4
									
								
								Grid/allocator/Allocator.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								Grid/allocator/Allocator.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | ||||
| #pragma once | ||||
| #include <Grid/allocator/MemoryStats.h> | ||||
| #include <Grid/allocator/MemoryManager.h> | ||||
| #include <Grid/allocator/AlignedAllocator.h> | ||||
							
								
								
									
										244
									
								
								Grid/allocator/MemoryManager.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										244
									
								
								Grid/allocator/MemoryManager.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,244 @@ | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /*Allocation types, saying which pointer cache should be used*/ | ||||
| #define Cpu      (0) | ||||
| #define CpuSmall (1) | ||||
| #define Acc      (2) | ||||
| #define AccSmall (3) | ||||
| #define Shared   (4) | ||||
| #define SharedSmall (5) | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| uint64_t total_host;; | ||||
| // Report the three running byte counters (shared, accelerator, cpu) on std::cout. | ||||
| void MemoryManager::PrintBytes(void) | ||||
| { | ||||
|   const char *tag = " MemoryManager : "; | ||||
|   std::cout << tag << total_shared << " shared      bytes " << std::endl; | ||||
|   std::cout << tag << total_device << " accelerator bytes " << std::endl; | ||||
|   std::cout << tag << total_host   << " cpu         bytes " << std::endl; | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Data tables for recently freed pointer caches | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | ||||
| int MemoryManager::Victim[MemoryManager::NallocType]; | ||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 }; | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Actual allocation and deallocation utils | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Return `bytes` of device memory, reusing a cached block of the same size if Lookup finds one. | ||||
| void *MemoryManager::AcceleratorAllocate(size_t bytes) | ||||
| { | ||||
|   void *cached = Lookup(bytes,Acc); | ||||
|   if ( cached != NULL ) return cached; | ||||
|  | ||||
|   // Cache miss: take fresh device memory and add it to the running total | ||||
|   void *fresh = (void *) acceleratorAllocDevice(bytes); | ||||
|   total_device+=bytes; | ||||
|   return fresh; | ||||
| } | ||||
| // Give back a device block: it is offered to the cache, and whatever Insert returns is released. | ||||
| void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes) | ||||
| { | ||||
|   void *evicted = Insert(ptr,bytes,Acc); | ||||
|   if ( evicted == NULL ) return;   // nothing displaced, nothing to release | ||||
|  | ||||
|   acceleratorFreeDevice(evicted); | ||||
|   // NOTE(review): the total drops by the size of ptr, not by the size recorded | ||||
|   // for the displaced block; confirm the two always agree. | ||||
|   total_device-=bytes; | ||||
| } | ||||
| // Return `bytes` of shared memory, reusing a cached block of the same size if Lookup finds one. | ||||
| void *MemoryManager::SharedAllocate(size_t bytes) | ||||
| { | ||||
|   void *cached = Lookup(bytes,Shared); | ||||
|   if ( cached != NULL ) return cached; | ||||
|  | ||||
|   // Cache miss: take fresh shared memory and add it to the running total | ||||
|   void *fresh = (void *) acceleratorAllocShared(bytes); | ||||
|   total_shared+=bytes; | ||||
|   return fresh; | ||||
| } | ||||
| // Give back a shared block: it is offered to the cache, and whatever Insert returns is released. | ||||
| void  MemoryManager::SharedFree    (void *ptr,size_t bytes) | ||||
| { | ||||
|   void *evicted = Insert(ptr,bytes,Shared); | ||||
|   if ( evicted == NULL ) return;   // nothing displaced, nothing to release | ||||
|  | ||||
|   acceleratorFreeShared(evicted); | ||||
|   // NOTE(review): the total drops by the size of ptr, not by the size recorded | ||||
|   // for the displaced block; confirm the two always agree. | ||||
|   total_shared-=bytes; | ||||
| } | ||||
| // Host-side allocate/free.  Both build flavours go through the Cpu cache and the | ||||
| // total_host counter; they differ only in the underlying allocator: | ||||
| //   GRID_UVM defined : acceleratorAllocShared / acceleratorFreeShared | ||||
| //   otherwise        : acceleratorAllocCpu    / acceleratorFreeCpu | ||||
| #ifdef GRID_UVM | ||||
| void *MemoryManager::CpuAllocate(size_t bytes) | ||||
| { | ||||
|   void *cached = Lookup(bytes,Cpu); | ||||
|   if ( cached != NULL ) return cached; | ||||
|  | ||||
|   void *fresh = (void *) acceleratorAllocShared(bytes); | ||||
|   total_host+=bytes; | ||||
|   return fresh; | ||||
| } | ||||
| void  MemoryManager::CpuFree    (void *_ptr,size_t bytes) | ||||
| { | ||||
|   // NotifyDeletion is called before the pointer is offered to the cache | ||||
|   NotifyDeletion(_ptr); | ||||
|   void *evicted = Insert(_ptr,bytes,Cpu); | ||||
|   if ( evicted == NULL ) return; | ||||
|  | ||||
|   acceleratorFreeShared(evicted); | ||||
|   total_host-=bytes; | ||||
| } | ||||
| #else | ||||
| void *MemoryManager::CpuAllocate(size_t bytes) | ||||
| { | ||||
|   void *cached = Lookup(bytes,Cpu); | ||||
|   if ( cached != NULL ) return cached; | ||||
|  | ||||
|   void *fresh = (void *) acceleratorAllocCpu(bytes); | ||||
|   total_host+=bytes; | ||||
|   return fresh; | ||||
| } | ||||
| void  MemoryManager::CpuFree    (void *_ptr,size_t bytes) | ||||
| { | ||||
|   // NotifyDeletion is called before the pointer is offered to the cache | ||||
|   NotifyDeletion(_ptr); | ||||
|   void *evicted = Insert(_ptr,bytes,Cpu); | ||||
|   if ( evicted == NULL ) return; | ||||
|  | ||||
|   acceleratorFreeCpu(evicted); | ||||
|   total_host-=bytes; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| ////////////////////////////////////////// | ||||
| // call only once | ||||
| ////////////////////////////////////////// | ||||
| // Apply the optional cache-depth overrides from the environment | ||||
| // (GRID_ALLOC_NCACHE_LARGE, GRID_ALLOC_NCACHE_SMALL) and log the memory model in use. | ||||
| void MemoryManager::Init(void) | ||||
| { | ||||
|   // Read a cache depth from environment variable `name`. | ||||
|   // Sets `depth` and returns true only for a decimal integer in [0, NallocCacheMax). | ||||
|   // An unset variable, a value with no leading digits, or an out-of-range value | ||||
|   // returns false so the current depth is kept.  strtol is used because atoi | ||||
|   // cannot tell a non-numeric string from a real 0. | ||||
|   auto readDepth = [](const char *name, int &depth) -> bool { | ||||
|     const char *str = getenv(name); | ||||
|     if ( str == NULL ) return false; | ||||
|     char *end = NULL; | ||||
|     long val = strtol(str,&end,10); | ||||
|     if ( end == str ) return false; | ||||
|     if ( (val < 0) || (val >= NallocCacheMax) ) return false; | ||||
|     depth = (int) val; | ||||
|     return true; | ||||
|   }; | ||||
|  | ||||
|   int Nc; | ||||
|  | ||||
|   // Depth of the large-block caches, one per allocation type | ||||
|   if ( readDepth("GRID_ALLOC_NCACHE_LARGE",Nc) ) { | ||||
|     Ncache[Cpu]=Nc; | ||||
|     Ncache[Acc]=Nc; | ||||
|     Ncache[Shared]=Nc; | ||||
|   } | ||||
|  | ||||
|   // Depth of the small-block caches, one per allocation type | ||||
|   if ( readDepth("GRID_ALLOC_NCACHE_SMALL",Nc) ) { | ||||
|     Ncache[CpuSmall]=Nc; | ||||
|     Ncache[AccSmall]=Nc; | ||||
|     Ncache[SharedSmall]=Nc; | ||||
|   } | ||||
|  | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl; | ||||
| #endif | ||||
|  | ||||
|   // Log which memory model and which backend allocator this build uses | ||||
| #ifdef GRID_UVM | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl; | ||||
| #ifdef GRID_CUDA | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl; | ||||
| #endif | ||||
| #else | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl; | ||||
| #ifdef GRID_CUDA | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl; | ||||
| #endif | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // Offer a freed block to the cache for allocation type `type`. | ||||
| // Returns a pointer the caller must really release, or NULL if there is none. | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   // Each type owns two adjacent caches: index `type` for large blocks and | ||||
|   // `type+1` for blocks below GRID_ALLOC_SMALL_LIMIT. | ||||
|   int cache = (bytes < GRID_ALLOC_SMALL_LIMIT) ? type+1 : type; | ||||
|   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);   | ||||
| #else | ||||
|   // Caching compiled out: hand the pointer straight back for release | ||||
|   return ptr; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // Store (ptr,bytes) in the cache `entries`, which holds `ncache` slots. | ||||
| // Returns the pointer the caller must release: | ||||
| //   - ptr itself when ncache is 0 (nothing can be stored), | ||||
| //   - the displaced entry when a valid slot had to be reused, | ||||
| //   - NULL when an unused slot was available. | ||||
| // `victim` is the round-robin replacement cursor; it advances only when no slot is free. | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)  | ||||
| { | ||||
|   assert(ncache>=0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   // Init() accepts a depth of 0, so an empty cache must not abort here | ||||
|   if ( ncache==0 ) return ptr; | ||||
|  | ||||
|   // Prefer the first unused slot | ||||
|   int v = -1; | ||||
|   for(int e=0;e<ncache;e++) { | ||||
|     if ( entries[e].valid==0 ) { | ||||
|       v=e;  | ||||
|       break; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // No unused slot: reuse the one under the cursor and move the cursor on | ||||
|   if ( v==-1 ) { | ||||
|     v=victim; | ||||
|     victim = (victim+1)%ncache; | ||||
|   } | ||||
|  | ||||
|   // A valid occupant is displaced and returned to the caller | ||||
|   void * ret = NULL; | ||||
|   if ( entries[v].valid ) { | ||||
|     ret = entries[v].address; | ||||
|   } | ||||
|  | ||||
|   entries[v].address=ptr; | ||||
|   entries[v].bytes  =bytes; | ||||
|   entries[v].valid  =1; | ||||
|  | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| // Look for a cached block of `bytes` bytes for allocation type `type`. | ||||
| // Returns NULL on a miss, and always when caching is compiled out. | ||||
| void *MemoryManager::Lookup(size_t bytes,int type) | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   // Each type owns two adjacent caches: index `type` for large blocks and | ||||
|   // `type+1` for blocks below GRID_ALLOC_SMALL_LIMIT. | ||||
|   int cache = (bytes < GRID_ALLOC_SMALL_LIMIT) ? type+1 : type; | ||||
|   return Lookup(bytes,Entries[cache],Ncache[cache]); | ||||
| #else | ||||
|   return NULL; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // Search the `ncache` slots of `entries` for a valid block of exactly `bytes` bytes. | ||||
| // On a hit the slot is marked unused and its address returned; otherwise NULL. | ||||
| void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)  | ||||
| { | ||||
|   // Init() accepts a depth of 0; an empty cache is simply a miss (the loop does not run) | ||||
|   assert(ncache>=0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|   for(int e=0;e<ncache;e++){ | ||||
|     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { | ||||
|       entries[e].valid = 0; | ||||
|       return entries[e].address; | ||||
|     } | ||||
|   } | ||||
|   return NULL; | ||||
| } | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
							
								
								
									
										181
									
								
								Grid/allocator/MemoryManager.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										181
									
								
								Grid/allocator/MemoryManager.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,181 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./Grid/allocator/MemoryManager.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once | ||||
| #include <list>  | ||||
| #include <unordered_map>   | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
// Open question from the original author: should these switches be
// controlled from configure.ac / Config.h instead?
#define ALLOCATION_CACHE                      // NOTE(review): presumably enables the freed-allocation pool -- confirm at point of use
#define GRID_ALLOC_ALIGN       (2*1024*1024)  // 2 MiB
#define GRID_ALLOC_SMALL_LIMIT (4096)         // 4 KiB
|  | ||||
| /*Pinning pages is costly*/ | ||||
| //////////////////////////////////////////////////////////////////////////// | ||||
| // Advise the LatticeAccelerator class | ||||
| //////////////////////////////////////////////////////////////////////////// | ||||
enum ViewAdvise {
  // Regular data
  AdviseDefault       = 0x0,

  // Advise that the data is used infrequently.  This can
  // significantly influence performance of bulk storage.
  AdviseInfrequentUse = 0x1

  // Currently disabled values:
  //
  //   AdviseTransient = 0x2
  //     Data will mostly be read.  On some architectures enables
  //     read-only copies of memory to be kept on host and device.
  //
  //   AdviseAcceleratorWriteDiscard = 0x4
  //     Field will be written in entirety on device.
};
|  | ||||
| //////////////////////////////////////////////////////////////////////////// | ||||
| // View Access Mode | ||||
| //////////////////////////////////////////////////////////////////////////// | ||||
enum ViewMode {
  // Views opened on the accelerator
  AcceleratorRead         = 0x01,
  AcceleratorWrite        = 0x02,
  AcceleratorWriteDiscard = 0x04,
  // Views opened on the host
  CpuRead                 = 0x08,
  CpuWrite                = 0x10,
  CpuWriteDiscard         = CpuWrite // same value as CpuWrite for now
};
|  | ||||
| class MemoryManager { | ||||
| private: | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // For caching recently freed allocations | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   typedef struct {  | ||||
|     void *address; | ||||
|     size_t bytes; | ||||
|     int valid; | ||||
|   } AllocationCacheEntry; | ||||
|  | ||||
|   static const int NallocCacheMax=128;  | ||||
|   static const int NallocType=6; | ||||
|   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; | ||||
|   static int Victim[NallocType]; | ||||
|   static int Ncache[NallocType]; | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Free pool | ||||
|   ///////////////////////////////////////////////// | ||||
|   static void *Insert(void *ptr,size_t bytes,int type) ; | ||||
|   static void *Lookup(size_t bytes,int type) ; | ||||
|   static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; | ||||
|   static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; | ||||
|  | ||||
|   static void *AcceleratorAllocate(size_t bytes); | ||||
|   static void  AcceleratorFree    (void *ptr,size_t bytes); | ||||
|   static void PrintBytes(void); | ||||
|  public: | ||||
|   static void Init(void); | ||||
|   static void *SharedAllocate(size_t bytes); | ||||
|   static void  SharedFree    (void *ptr,size_t bytes); | ||||
|   static void *CpuAllocate(size_t bytes); | ||||
|   static void  CpuFree    (void *ptr,size_t bytes); | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // Footprint tracking | ||||
|   //////////////////////////////////////////////////////// | ||||
|   static uint64_t     DeviceBytes; | ||||
|   static uint64_t     DeviceLRUBytes; | ||||
|   static uint64_t     DeviceMaxBytes; | ||||
|   static uint64_t     HostToDeviceBytes; | ||||
|   static uint64_t     DeviceToHostBytes; | ||||
|   static uint64_t     HostToDeviceXfer; | ||||
|   static uint64_t     DeviceToHostXfer; | ||||
|   | ||||
|  private: | ||||
| #ifndef GRID_UVM | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   // Data tables for ViewCache | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   typedef std::list<uint64_t> LRU_t; | ||||
|   typedef typename LRU_t::iterator LRUiterator; | ||||
|   typedef struct {  | ||||
|     int        LRU_valid; | ||||
|     LRUiterator LRU_entry; | ||||
|     uint64_t CpuPtr; | ||||
|     uint64_t AccPtr; | ||||
|     size_t   bytes; | ||||
|     uint32_t transient; | ||||
|     uint32_t state; | ||||
|     uint32_t accLock; | ||||
|     uint32_t cpuLock; | ||||
|   } AcceleratorViewEntry; | ||||
|    | ||||
|   typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t; | ||||
|   typedef typename AccViewTable_t::iterator AccViewTableIterator ; | ||||
|  | ||||
|   static AccViewTable_t AccViewTable; | ||||
|   static LRU_t LRU; | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Device motion | ||||
|   ///////////////////////////////////////////////// | ||||
|   static void  Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); | ||||
|   static void  EvictVictims(uint64_t bytes); // Frees up <bytes> | ||||
|   static void  Evict(AcceleratorViewEntry &AccCache); | ||||
|   static void  Flush(AcceleratorViewEntry &AccCache); | ||||
|   static void  Clone(AcceleratorViewEntry &AccCache); | ||||
|   static void  AccDiscard(AcceleratorViewEntry &AccCache); | ||||
|   static void  CpuDiscard(AcceleratorViewEntry &AccCache); | ||||
|  | ||||
|   //  static void  LRUupdate(AcceleratorViewEntry &AccCache); | ||||
|   static void  LRUinsert(AcceleratorViewEntry &AccCache); | ||||
|   static void  LRUremove(AcceleratorViewEntry &AccCache); | ||||
|    | ||||
|   // manage entries in the table | ||||
|   static int                  EntryPresent(uint64_t CpuPtr); | ||||
|   static void                 EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); | ||||
|   static void                 EntryErase (uint64_t CpuPtr); | ||||
|   static AccViewTableIterator EntryLookup(uint64_t CpuPtr); | ||||
|   static void                 EntrySet   (uint64_t CpuPtr,AcceleratorViewEntry &entry); | ||||
|  | ||||
|   static void     AcceleratorViewClose(uint64_t AccPtr); | ||||
|   static uint64_t AcceleratorViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); | ||||
|   static void     CpuViewClose(uint64_t Ptr); | ||||
|   static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); | ||||
| #endif | ||||
|   static void NotifyDeletion(void * CpuPtr); | ||||
|  | ||||
|  public: | ||||
|   static void Print(void); | ||||
|   static int   isOpen   (void* CpuPtr); | ||||
|   static void  ViewClose(void* CpuPtr,ViewMode mode); | ||||
|   static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); | ||||
|  | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
							
								
								
									
										468
									
								
								Grid/allocator/MemoryManagerCache.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										468
									
								
								Grid/allocator/MemoryManagerCache.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,468 @@ | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| #ifndef GRID_UVM | ||||
|  | ||||
| #warning "Using explicit device memory copies" | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| #define dprintf(...) | ||||
|  | ||||
| //////////////////////////////////////////////////////////// | ||||
| // For caching copies of data on device | ||||
| //////////////////////////////////////////////////////////// | ||||
| MemoryManager::AccViewTable_t MemoryManager::AccViewTable; | ||||
| MemoryManager::LRU_t MemoryManager::LRU; | ||||
|    | ||||
| //////////////////////////////////////////////////////// | ||||
| // Footprint tracking | ||||
| //////////////////////////////////////////////////////// | ||||
| uint64_t  MemoryManager::DeviceBytes; | ||||
| uint64_t  MemoryManager::DeviceLRUBytes; | ||||
| uint64_t  MemoryManager::DeviceMaxBytes = 1024*1024*128; | ||||
| uint64_t  MemoryManager::HostToDeviceBytes; | ||||
| uint64_t  MemoryManager::DeviceToHostBytes; | ||||
| uint64_t  MemoryManager::HostToDeviceXfer; | ||||
| uint64_t  MemoryManager::DeviceToHostXfer; | ||||
|  | ||||
////////////////////////////////////
// Entry states.
// Priority ordering for unlocked entries:
//   Empty, CpuDirty, Consistent, AccDirty
////////////////////////////////////
#define Empty         (0x0)  // Entry unoccupied
#define CpuDirty      (0x1)  // CPU copy is golden, Acc buffer MAY not be allocated
#define Consistent    (0x2)  // ACC copy AND CPU copy are valid
#define AccDirty      (0x4)  // ACC copy is golden
#define EvictNext     (0x8)  // Priority for eviction
|  | ||||
| ///////////////////////////////////////////////// | ||||
| // Mechanics of data table maintenance | ||||
| ///////////////////////////////////////////////// | ||||
| int   MemoryManager::EntryPresent(uint64_t CpuPtr) | ||||
| { | ||||
|   if(AccViewTable.empty()) return 0; | ||||
|  | ||||
|   auto count = AccViewTable.count(CpuPtr);  assert((count==0)||(count==1)); | ||||
|   return count; | ||||
| } | ||||
| void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) | ||||
| { | ||||
|   assert(!EntryPresent(CpuPtr)); | ||||
|   AcceleratorViewEntry AccCache; | ||||
|   AccCache.CpuPtr = CpuPtr; | ||||
|   AccCache.AccPtr = (uint64_t)NULL; | ||||
|   AccCache.bytes  = bytes; | ||||
|   AccCache.state  = CpuDirty; | ||||
|   AccCache.LRU_valid=0; | ||||
|   AccCache.transient=0; | ||||
|   AccCache.accLock=0; | ||||
|   AccCache.cpuLock=0; | ||||
|   AccViewTable[CpuPtr] = AccCache; | ||||
| } | ||||
| MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr) | ||||
| { | ||||
|   assert(EntryPresent(CpuPtr)); | ||||
|   auto AccCacheIterator = AccViewTable.find(CpuPtr); | ||||
|   assert(AccCacheIterator!=AccViewTable.end()); | ||||
|   return AccCacheIterator; | ||||
| } | ||||
| void MemoryManager::EntryErase(uint64_t CpuPtr) | ||||
| { | ||||
|   auto AccCache = EntryLookup(CpuPtr); | ||||
|   AccViewTable.erase(CpuPtr); | ||||
| } | ||||
| void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   assert(AccCache.LRU_valid==0); | ||||
|   if (AccCache.transient) {  | ||||
|     LRU.push_back(AccCache.CpuPtr); | ||||
|     AccCache.LRU_entry = --LRU.end(); | ||||
|   } else { | ||||
|     LRU.push_front(AccCache.CpuPtr); | ||||
|     AccCache.LRU_entry = LRU.begin(); | ||||
|   } | ||||
|   AccCache.LRU_valid = 1; | ||||
|   DeviceLRUBytes+=AccCache.bytes; | ||||
| } | ||||
| void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   assert(AccCache.LRU_valid==1); | ||||
|   LRU.erase(AccCache.LRU_entry); | ||||
|   AccCache.LRU_valid = 0; | ||||
|   DeviceLRUBytes-=AccCache.bytes; | ||||
| } | ||||
| ///////////////////////////////////////////////// | ||||
| // Accelerator cache motion & consistency logic | ||||
| ///////////////////////////////////////////////// | ||||
| void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   /////////////////////////////////////////////////////////// | ||||
|   // Remove from Accelerator, remove entry, without flush | ||||
|   // Cannot be locked. If allocated Must be in LRU pool. | ||||
|   /////////////////////////////////////////////////////////// | ||||
|   assert(AccCache.state!=Empty); | ||||
|    | ||||
|   //  dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);  | ||||
|   assert(AccCache.accLock==0); | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||
|   if(AccCache.AccPtr) { | ||||
|     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); | ||||
|     DeviceBytes   -=AccCache.bytes; | ||||
|     LRUremove(AccCache); | ||||
|     //    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);   | ||||
|   } | ||||
|   uint64_t CpuPtr = AccCache.CpuPtr; | ||||
|   EntryErase(CpuPtr); | ||||
| } | ||||
|  | ||||
| void MemoryManager::Evict(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|   // Make CPU consistent, remove from Accelerator, remove entry | ||||
|   // Cannot be locked. If allocated must be in LRU pool. | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|   assert(AccCache.state!=Empty); | ||||
|    | ||||
|   //  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);  | ||||
|   assert(AccCache.accLock==0); | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   if(AccCache.state==AccDirty) { | ||||
|     Flush(AccCache); | ||||
|   } | ||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||
|   if(AccCache.AccPtr) { | ||||
|     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); | ||||
|     DeviceBytes   -=AccCache.bytes; | ||||
|     LRUremove(AccCache); | ||||
|     //    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);   | ||||
|   } | ||||
|   uint64_t CpuPtr = AccCache.CpuPtr; | ||||
|   EntryErase(CpuPtr); | ||||
| } | ||||
| void MemoryManager::Flush(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   assert(AccCache.state==AccDirty); | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   assert(AccCache.accLock==0); | ||||
|   assert(AccCache.AccPtr!=(uint64_t)NULL); | ||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||
|   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); | ||||
|   //  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); | ||||
|   DeviceToHostBytes+=AccCache.bytes; | ||||
|   DeviceToHostXfer++; | ||||
|   AccCache.state=Consistent; | ||||
| } | ||||
| void MemoryManager::Clone(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   assert(AccCache.state==CpuDirty); | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   assert(AccCache.accLock==0); | ||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||
|   if(AccCache.AccPtr==(uint64_t)NULL){ | ||||
|     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); | ||||
|     DeviceBytes+=AccCache.bytes; | ||||
|   } | ||||
|   //  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); | ||||
|   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); | ||||
|   HostToDeviceBytes+=AccCache.bytes; | ||||
|   HostToDeviceXfer++; | ||||
|   AccCache.state=Consistent; | ||||
| } | ||||
|  | ||||
| void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) | ||||
| { | ||||
|   assert(AccCache.state!=Empty); | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   assert(AccCache.accLock==0); | ||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||
|   if(AccCache.AccPtr==(uint64_t)NULL){ | ||||
|     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); | ||||
|     DeviceBytes+=AccCache.bytes; | ||||
|   } | ||||
|   AccCache.state=AccDirty; | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| // View management | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| void MemoryManager::ViewClose(void* Ptr,ViewMode mode) | ||||
| { | ||||
|   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ | ||||
|     AcceleratorViewClose((uint64_t)Ptr); | ||||
|   } else if( (mode==CpuRead)||(mode==CpuWrite)){ | ||||
|     CpuViewClose((uint64_t)Ptr); | ||||
|   } else {  | ||||
|     assert(0); | ||||
|   } | ||||
| } | ||||
| void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) | ||||
| { | ||||
|   uint64_t CpuPtr = (uint64_t)_CpuPtr; | ||||
|   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ | ||||
|     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); | ||||
|   } else if( (mode==CpuRead)||(mode==CpuWrite)){ | ||||
|     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); | ||||
|   } else {  | ||||
|     assert(0); | ||||
|     return NULL; | ||||
|   } | ||||
| } | ||||
| void  MemoryManager::EvictVictims(uint64_t bytes) | ||||
| { | ||||
|   while(bytes+DeviceLRUBytes > DeviceMaxBytes){ | ||||
|     if ( DeviceLRUBytes > 0){ | ||||
|       assert(LRU.size()>0); | ||||
|       uint64_t victim = LRU.back(); | ||||
|       auto AccCacheIterator = EntryLookup(victim); | ||||
|       auto & AccCache = AccCacheIterator->second; | ||||
|       Evict(AccCache); | ||||
|     } | ||||
|   } | ||||
| } | ||||
| uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////////////////// | ||||
|   // Find if present, otherwise get or force an empty | ||||
|   //////////////////////////////////////////////////////////////////////////// | ||||
|   if ( EntryPresent(CpuPtr)==0 ){ | ||||
|     EvictVictims(bytes); | ||||
|     EntryCreate(CpuPtr,bytes,mode,hint); | ||||
|   } | ||||
|  | ||||
|   auto AccCacheIterator = EntryLookup(CpuPtr); | ||||
|   auto & AccCache = AccCacheIterator->second; | ||||
|    | ||||
|   assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); | ||||
|  | ||||
|   assert(AccCache.cpuLock==0);  // Programming error | ||||
|  | ||||
|   if(AccCache.state!=Empty) { | ||||
|     assert(AccCache.CpuPtr == CpuPtr); | ||||
|     assert(AccCache.bytes  ==bytes); | ||||
|   } | ||||
| /* | ||||
|  *  State transitions and actions | ||||
|  * | ||||
|  *  Action  State   StateNext         Flush    Clone | ||||
|  * | ||||
|  *  AccRead  Empty   Consistent        -        Y | ||||
|  *  AccWrite Empty   AccDirty          -        Y | ||||
|  *  AccRead  CpuDirty Consistent       -        Y | ||||
|  *  AccWrite CpuDirty AccDirty         -        Y | ||||
|  *  AccRead  Consistent Consistent     -        -  | ||||
|  *  AccWrite Consistent AccDirty       -        -  | ||||
|  *  AccRead  AccDirty   AccDirty       -        -  | ||||
|  *  AccWrite AccDirty   AccDirty       -        -  | ||||
|  */ | ||||
|   if(AccCache.state==Empty) { | ||||
|     assert(AccCache.LRU_valid==0); | ||||
|     AccCache.CpuPtr = CpuPtr; | ||||
|     AccCache.AccPtr = (uint64_t)NULL; | ||||
|     AccCache.bytes  = bytes; | ||||
|     AccCache.state  = CpuDirty;   // Cpu starts primary | ||||
|     if(mode==AcceleratorWriteDiscard){ | ||||
|       CpuDiscard(AccCache); | ||||
|       AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty | ||||
|     } else if(mode==AcceleratorWrite){ | ||||
|       Clone(AccCache); | ||||
|       AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty | ||||
|     } else { | ||||
|       Clone(AccCache); | ||||
|       AccCache.state  = Consistent; // Empty + AccRead => Consistent | ||||
|     } | ||||
|     AccCache.accLock= 1; | ||||
|   } else if(AccCache.state==CpuDirty ){ | ||||
|     if(mode==AcceleratorWriteDiscard) { | ||||
|       CpuDiscard(AccCache); | ||||
|       AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty | ||||
|     } else if(mode==AcceleratorWrite) { | ||||
|       Clone(AccCache); | ||||
|       AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty | ||||
|     } else { | ||||
|       Clone(AccCache); | ||||
|       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent | ||||
|     } | ||||
|     AccCache.accLock++; | ||||
|     //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); | ||||
|   } else if(AccCache.state==Consistent) { | ||||
|     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) | ||||
|       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty | ||||
|     else | ||||
|       AccCache.state  = Consistent; // Consistent + AccRead => Consistent | ||||
|     AccCache.accLock++; | ||||
|     //    printf("Consistent entry into device accLock %d\n",AccCache.accLock); | ||||
|   } else if(AccCache.state==AccDirty) { | ||||
|     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) | ||||
|       AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty | ||||
|     else | ||||
|       AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty | ||||
|     AccCache.accLock++; | ||||
|     //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock); | ||||
|   } else { | ||||
|     assert(0); | ||||
|   } | ||||
|  | ||||
|   // If view is opened on device remove from LRU | ||||
|   if(AccCache.LRU_valid==1){ | ||||
|     // must possibly remove from LRU as now locked on GPU | ||||
|     LRUremove(AccCache); | ||||
|   } | ||||
|  | ||||
|   int transient =hint; | ||||
|   AccCache.transient= transient? EvictNext : 0; | ||||
|  | ||||
|   return AccCache.AccPtr; | ||||
| } | ||||
| //////////////////////////////////// | ||||
| // look up & decrement lock count | ||||
| //////////////////////////////////// | ||||
| void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) | ||||
| { | ||||
|   auto AccCacheIterator = EntryLookup(CpuPtr); | ||||
|   auto & AccCache = AccCacheIterator->second; | ||||
|  | ||||
|   assert(AccCache.cpuLock==0); | ||||
|   assert(AccCache.accLock>0); | ||||
|  | ||||
|   AccCache.accLock--; | ||||
|  | ||||
|   // Move to LRU queue if not locked and close on device | ||||
|   if(AccCache.accLock==0) { | ||||
|     LRUinsert(AccCache); | ||||
|   } | ||||
| } | ||||
| void MemoryManager::CpuViewClose(uint64_t CpuPtr) | ||||
| { | ||||
|   auto AccCacheIterator = EntryLookup(CpuPtr); | ||||
|   auto & AccCache = AccCacheIterator->second; | ||||
|  | ||||
|   assert(AccCache.cpuLock>0); | ||||
|   assert(AccCache.accLock==0); | ||||
|  | ||||
|   AccCache.cpuLock--; | ||||
| } | ||||
| /* | ||||
|  *  Action  State   StateNext         Flush    Clone | ||||
|  * | ||||
|  *  CpuRead  Empty   CpuDirty          -        - | ||||
|  *  CpuWrite Empty   CpuDirty          -        - | ||||
|  *  CpuRead  CpuDirty CpuDirty         -        - | ||||
|  *  CpuWrite CpuDirty CpuDirty         -        -  | ||||
|  *  CpuRead  Consistent Consistent     -        -  | ||||
|  *  CpuWrite Consistent CpuDirty       -        -  | ||||
|  *  CpuRead  AccDirty   Consistent     Y        - | ||||
|  *  CpuWrite AccDirty   CpuDirty       Y        - | ||||
|  */ | ||||
| uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////////////////// | ||||
|   // Find if present, otherwise get or force an empty | ||||
|   //////////////////////////////////////////////////////////////////////////// | ||||
|   if ( EntryPresent(CpuPtr)==0 ){ | ||||
|     EvictVictims(bytes); | ||||
|     EntryCreate(CpuPtr,bytes,mode,transient); | ||||
|   } | ||||
|  | ||||
|   auto AccCacheIterator = EntryLookup(CpuPtr); | ||||
|   auto & AccCache = AccCacheIterator->second; | ||||
|    | ||||
|   assert((mode==CpuRead)||(mode==CpuWrite)); | ||||
|   assert(AccCache.accLock==0);  // Programming error | ||||
|  | ||||
|   if(AccCache.state!=Empty) { | ||||
|     assert(AccCache.CpuPtr == CpuPtr); | ||||
|     assert(AccCache.bytes==bytes); | ||||
|   } | ||||
|  | ||||
|   if(AccCache.state==Empty) { | ||||
|     AccCache.CpuPtr = CpuPtr; | ||||
|     AccCache.AccPtr = (uint64_t)NULL; | ||||
|     AccCache.bytes  = bytes; | ||||
|     AccCache.state  = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty | ||||
|     AccCache.accLock= 0; | ||||
|     AccCache.cpuLock= 1; | ||||
|   } else if(AccCache.state==CpuDirty ){ | ||||
|     // AccPtr dont care, deferred allocate | ||||
|     AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty | ||||
|     AccCache.cpuLock++; | ||||
|   } else if(AccCache.state==Consistent) { | ||||
|     assert(AccCache.AccPtr != (uint64_t)NULL); | ||||
|     if(mode==CpuWrite) | ||||
|       AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty | ||||
|     else  | ||||
|       AccCache.state = Consistent; // Consistent +CpuRead  => Consistent | ||||
|     AccCache.cpuLock++; | ||||
|   } else if(AccCache.state==AccDirty) { | ||||
|     assert(AccCache.AccPtr != (uint64_t)NULL); | ||||
|     Flush(AccCache); | ||||
|     if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush | ||||
|     else            AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush | ||||
|     AccCache.cpuLock++; | ||||
|   } else { | ||||
|     assert(0); // should be unreachable | ||||
|   } | ||||
|  | ||||
|   AccCache.transient= transient? EvictNext : 0; | ||||
|  | ||||
|   return AccCache.CpuPtr; | ||||
| } | ||||
| void  MemoryManager::NotifyDeletion(void *_ptr) | ||||
| { | ||||
|   // Look up in ViewCache | ||||
|   uint64_t ptr = (uint64_t)_ptr; | ||||
|   if(EntryPresent(ptr)) { | ||||
|     auto e = EntryLookup(ptr); | ||||
|     AccDiscard(e->second); | ||||
|   } | ||||
| } | ||||
| void  MemoryManager::Print(void) | ||||
| { | ||||
|   std::cout << GridLogDebug << "--------------------------------------------" << std::endl; | ||||
|   std::cout << GridLogDebug << "Memory Manager                             " << std::endl; | ||||
|   std::cout << GridLogDebug << "--------------------------------------------" << std::endl; | ||||
|   std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl; | ||||
|   std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; | ||||
|   std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl; | ||||
|   std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl; | ||||
|   std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl; | ||||
|   std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl; | ||||
|   std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; | ||||
|   std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; | ||||
|   std::cout << GridLogDebug << "--------------------------------------------" << std::endl; | ||||
|   std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl; | ||||
|   std::cout << GridLogDebug << "--------------------------------------------" << std::endl; | ||||
|   for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){ | ||||
|     auto &AccCache = it->second; | ||||
|      | ||||
|     std::string str; | ||||
|     if ( AccCache.state==Empty    ) str = std::string("Empty"); | ||||
|     if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); | ||||
|     if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); | ||||
|     if ( AccCache.state==Consistent)str = std::string("Consistent"); | ||||
|  | ||||
|     std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec | ||||
| 	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str | ||||
| 	      << "\t" << AccCache.cpuLock | ||||
| 	      << "\t" << AccCache.accLock | ||||
| 	      << "\t" << AccCache.LRU_valid<<std::endl; | ||||
|   } | ||||
|   std::cout << GridLogDebug << "--------------------------------------------" << std::endl; | ||||
|  | ||||
| }; | ||||
| int   MemoryManager::isOpen   (void* _CpuPtr)  | ||||
| {  | ||||
|   uint64_t CpuPtr = (uint64_t)_CpuPtr; | ||||
|   if ( EntryPresent(CpuPtr) ){ | ||||
|     auto AccCacheIterator = EntryLookup(CpuPtr); | ||||
|     auto & AccCache = AccCacheIterator->second; | ||||
|     return AccCache.cpuLock+AccCache.accLock; | ||||
|   } else {  | ||||
|     return 0; | ||||
|   } | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
							
								
								
									
										24
									
								
								Grid/allocator/MemoryManagerShared.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								Grid/allocator/MemoryManagerShared.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| #include <Grid/GridCore.h> | ||||
| #ifdef GRID_UVM | ||||
|  | ||||
| #warning "Grid is assuming unified virtual memory address space" | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| // View management is 1:1 address space mapping | ||||
| ///////////////////////////////////////////////////////////////////////////////// | ||||
| uint64_t  MemoryManager::DeviceBytes; | ||||
| uint64_t  MemoryManager::DeviceLRUBytes; | ||||
| uint64_t  MemoryManager::DeviceMaxBytes = 1024*1024*128; | ||||
| uint64_t  MemoryManager::HostToDeviceBytes; | ||||
| uint64_t  MemoryManager::DeviceToHostBytes; | ||||
| uint64_t  MemoryManager::HostToDeviceXfer; | ||||
| uint64_t  MemoryManager::DeviceToHostXfer; | ||||
|  | ||||
| void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; | ||||
| void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; | ||||
| int   MemoryManager::isOpen   (void* CpuPtr) { return 0;} | ||||
| void  MemoryManager::Print(void){}; | ||||
| void  MemoryManager::NotifyDeletion(void *ptr){}; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
							
								
								
									
										67
									
								
								Grid/allocator/MemoryStats.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								Grid/allocator/MemoryStats.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <vector>
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| MemoryStats *MemoryProfiler::stats = nullptr; | ||||
| bool         MemoryProfiler::debug = false; | ||||
|  | ||||
| void check_huge_pages(void *Buf,uint64_t BYTES) | ||||
| { | ||||
| #ifdef __linux__ | ||||
|   int fd = open("/proc/self/pagemap", O_RDONLY); | ||||
|   assert(fd >= 0); | ||||
|   const int page_size = 4096; | ||||
|   uint64_t virt_pfn = (uint64_t)Buf / page_size; | ||||
|   off_t offset = sizeof(uint64_t) * virt_pfn; | ||||
|   uint64_t npages = (BYTES + page_size-1) / page_size; | ||||
|   uint64_t pagedata[npages]; | ||||
|   uint64_t ret = lseek(fd, offset, SEEK_SET); | ||||
|   assert(ret == offset); | ||||
|   ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); | ||||
|   assert(ret == sizeof(uint64_t) * npages); | ||||
|   int nhugepages = npages / 512; | ||||
|   int n4ktotal, nnothuge; | ||||
|   n4ktotal = 0; | ||||
|   nnothuge = 0; | ||||
|   for (int i = 0; i < nhugepages; ++i) { | ||||
|     uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; | ||||
|     for (int j = 0; j < 512; ++j) { | ||||
|       uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; | ||||
|       ++n4ktotal; | ||||
|       if (pageaddr != baseaddr + j * page_size) | ||||
| 	++nnothuge; | ||||
|     } | ||||
|   } | ||||
|   int rank = CartesianCommunicator::RankWorld(); | ||||
|   printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); | ||||
| #endif | ||||
| } | ||||
|  | ||||
// Format a byte count as a short human-readable string using binary (1024)
// steps, e.g. 0 -> "0 B", 1024 -> "1 KB", 1536 -> "1.5 KB".
//
//   bytes   : the size to format
//   returns : "<value> <suffix>B" where suffix is one of "", K, M, G, T, P, E;
//             the value is printed as an integer when it is whole, otherwise
//             with one decimal place.
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize = 256;
  const char             *suffixes[] = {"", "K", "M", "G", "T", "P", "E"};
  // Index of the largest suffix: scaling must stop here so that suffixes[s]
  // can never be read past the end of the table.
  constexpr size_t       lastSuffix = sizeof(suffixes)/sizeof(suffixes[0]) - 1;
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = static_cast<double>(bytes);
  
  while (count >= 1024 && s < lastSuffix)
    {
      s++;
      count /= 1024;
    }
  if (count - std::floor(count) == 0.0)
    {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
    }
  else
    {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
    }
  
  return std::string(buf);
}
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
							
								
								
									
										95
									
								
								Grid/allocator/MemoryStats.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								Grid/allocator/MemoryStats.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,95 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/MemoryStats.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once | ||||
|  | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| std::string sizeString(size_t bytes); | ||||
|  | ||||
// Allocation counters maintained by the profilerAllocate / profilerFree
// macros. All members start at zero.
struct MemoryStats
{
  size_t totalAllocated     = 0; // sum of every amount passed to profilerAllocate
  size_t maxAllocated       = 0; // largest value currentlyAllocated has reached
  size_t currentlyAllocated = 0; // allocated minus freed
  size_t totalFreed         = 0; // sum of every amount passed to profilerFree
};

// Global switches read by the profiler macros.
class MemoryProfiler
{
public:
  // Counters to update; the macros skip accounting while this is null.
  static MemoryStats *stats;
  // When true, the macros log each allocation/free through GridLogDebug.
  static bool         debug;
};
|  | ||||
// Memory profiling macros.
//
// Each statement-like macro below expands to a single `do { ... } while (0)`
// statement, so it is used as `profilerAllocate(n);` (with the semicolon) and
// behaves as one statement inside a braceless if/else. The `bytes` argument
// is evaluated exactly once, and the internal locals carry a trailing
// underscore so they cannot capture a caller variable of the same name.

// "<n> (<human readable size>)" as a std::string; parenthesised so the
// expansion is safe inside any larger expression. Note: evaluates `bytes`
// twice.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")

// Dump the current counters to GridLogDebug; does nothing while
// MemoryProfiler::stats is null.
#define profilerDebugPrint                                                                              \
  do {                                                                                                  \
    if (MemoryProfiler::stats)                                                                          \
      {                                                                                                 \
        auto profStats_ = MemoryProfiler::stats;                                                        \
        std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;     \
        std::cout << GridLogDebug << "[Memory debug] total  : " << memString(profStats_->totalAllocated) \
                  << std::endl;                                                                         \
        std::cout << GridLogDebug << "[Memory debug] max    : " << memString(profStats_->maxAllocated)  \
                  << std::endl;                                                                         \
        std::cout << GridLogDebug << "[Memory debug] current: " << memString(profStats_->currentlyAllocated) \
                  << std::endl;                                                                         \
        std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(profStats_->totalFreed)    \
                  << std::endl;                                                                         \
      }                                                                                                 \
  } while (0)

// Record an allocation of `bytes`: updates the counters when
// MemoryProfiler::stats is set, and logs when MemoryProfiler::debug is set.
#define profilerAllocate(bytes)                                                                         \
  do {                                                                                                  \
    const auto profBytes_ = (bytes);                                                                    \
    if (MemoryProfiler::stats)                                                                          \
      {                                                                                                 \
        auto profStats_ = MemoryProfiler::stats;                                                        \
        profStats_->totalAllocated     += profBytes_;                                                   \
        profStats_->currentlyAllocated += profBytes_;                                                   \
        profStats_->maxAllocated        = std::max(profStats_->maxAllocated, profStats_->currentlyAllocated); \
      }                                                                                                 \
    if (MemoryProfiler::debug)                                                                          \
      {                                                                                                 \
        std::cout << GridLogDebug << "[Memory debug] allocating " << memString(profBytes_) << std::endl; \
        profilerDebugPrint;                                                                             \
      }                                                                                                 \
  } while (0)

// Record a release of `bytes`: updates the counters when
// MemoryProfiler::stats is set, and logs when MemoryProfiler::debug is set.
#define profilerFree(bytes)                                                                             \
  do {                                                                                                  \
    const auto profBytes_ = (bytes);                                                                    \
    if (MemoryProfiler::stats)                                                                          \
      {                                                                                                 \
        auto profStats_ = MemoryProfiler::stats;                                                        \
        profStats_->totalFreed         += profBytes_;                                                   \
        profStats_->currentlyAllocated -= profBytes_;                                                   \
      }                                                                                                 \
    if (MemoryProfiler::debug)                                                                          \
      {                                                                                                 \
        std::cout << GridLogDebug << "[Memory debug] freeing " << memString(profBytes_) << std::endl;   \
        profilerDebugPrint;                                                                             \
      }                                                                                                 \
  } while (0)
|  | ||||
| void check_huge_pages(void *Buf,uint64_t BYTES); | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CARTESIAN_H | ||||
| #define GRID_CARTESIAN_H | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -25,268 +25,267 @@ | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CARTESIAN_BASE_H | ||||
| #define GRID_CARTESIAN_BASE_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| namespace Grid{ | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   // Communicator provides information on the processor grid | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   //    unsigned long _ndimension; | ||||
|   //    std::vector<int> _processors; // processor grid | ||||
|   //    int              _processor;  // linear processor rank | ||||
|   //    std::vector<int> _processor_coor;  // linear processor rank | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   class GridBase : public CartesianCommunicator , public GridThread { | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Communicator provides information on the processor grid | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| //    unsigned long _ndimension; | ||||
| //    Coordinate _processors; // processor grid | ||||
| //    int              _processor;  // linear processor rank | ||||
| //    Coordinate _processor_coor;  // linear processor rank | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| class GridBase : public CartesianCommunicator , public GridThread { | ||||
|  | ||||
| public: | ||||
|     int dummy; | ||||
|     // Give Lattice access | ||||
|     template<class object> friend class Lattice; | ||||
|   int dummy; | ||||
|   // Give Lattice access | ||||
|   template<class object> friend class Lattice; | ||||
|  | ||||
|     GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {}; | ||||
|     GridBase(const std::vector<int> & processor_grid, | ||||
| 	     const CartesianCommunicator &parent, | ||||
| 	     int &split_rank)  | ||||
|       : CartesianCommunicator(processor_grid,parent,split_rank) {}; | ||||
|     GridBase(const std::vector<int> & processor_grid, | ||||
| 	     const CartesianCommunicator &parent)  | ||||
|       : CartesianCommunicator(processor_grid,parent,dummy) {}; | ||||
|   GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;};  | ||||
|  | ||||
|     virtual ~GridBase() = default; | ||||
|   GridBase(const Coordinate & processor_grid, | ||||
| 	   const CartesianCommunicator &parent, | ||||
| 	   int &split_rank)  | ||||
|     : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;}; | ||||
|  | ||||
|   GridBase(const Coordinate & processor_grid, | ||||
| 	   const CartesianCommunicator &parent)  | ||||
|     : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;}; | ||||
|  | ||||
|     // Physics Grid information. | ||||
|     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. | ||||
|     std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal | ||||
|     std::vector<int> _gdimensions;// Global dimensions of array after cb removal | ||||
|     std::vector<int> _ldimensions;// local dimensions of array with processor images removed | ||||
|     std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed  | ||||
|     std::vector<int> _ostride;    // Outer stride for each dimension | ||||
|     std::vector<int> _istride;    // Inner stride i.e. within simd lane | ||||
|     int _osites;                  // _isites*_osites = product(dimensions). | ||||
|     int _isites; | ||||
|     int _fsites;                  // _isites*_osites = product(dimensions). | ||||
|     int _gsites; | ||||
|     std::vector<int> _slice_block;// subslice information | ||||
|     std::vector<int> _slice_stride; | ||||
|     std::vector<int> _slice_nblock; | ||||
|   virtual ~GridBase() = default; | ||||
|  | ||||
|     std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d] | ||||
|     std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 | ||||
|   // Physics Grid information. | ||||
|   Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes. | ||||
|   Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal | ||||
|   Coordinate _gdimensions;// Global dimensions of array after cb removal | ||||
|   Coordinate _ldimensions;// local dimensions of array with processor images removed | ||||
|   Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed  | ||||
|   Coordinate _ostride;    // Outer stride for each dimension | ||||
|   Coordinate _istride;    // Inner stride i.e. within simd lane | ||||
|   int _osites;                  // _isites*_osites = product(dimensions). | ||||
|   int _isites; | ||||
|   int _fsites;                  // _isites*_osites = product(dimensions). | ||||
|   int _gsites; | ||||
|   Coordinate _slice_block;// subslice information | ||||
|   Coordinate _slice_stride; | ||||
|   Coordinate _slice_nblock; | ||||
|  | ||||
|     bool _isCheckerBoarded;  | ||||
|   Coordinate _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d] | ||||
|   Coordinate _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 | ||||
|  | ||||
|   bool _isCheckerBoarded;  | ||||
|   int        LocallyPeriodic; | ||||
|   Coordinate _checker_dim_mask; | ||||
|  | ||||
| public: | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Checkerboarding interface is virtual and overridden by  | ||||
|     // GridCartesian / GridRedBlackCartesian | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     virtual int CheckerBoarded(int dim)=0; | ||||
|     virtual int CheckerBoard(const std::vector<int> &site)=0; | ||||
|     virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||
|     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||
|     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||
|     virtual int CheckerBoardFromOindex (int Oindex)=0; | ||||
|     virtual int CheckerBoardFromOindexTable (int Oindex)=0; | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Checkerboarding interface is virtual and overridden by  | ||||
|   // GridCartesian / GridRedBlackCartesian | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   virtual int CheckerBoarded(int dim)=0; | ||||
|   virtual int CheckerBoard(const Coordinate &site)=0; | ||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||
|   virtual int CheckerBoardFromOindex (int Oindex)=0; | ||||
|   virtual int CheckerBoardFromOindexTable (int Oindex)=0; | ||||
|  | ||||
|     ////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Local layout calculations | ||||
|     ////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // These routines are key. Subdivide the linearised cartesian index into | ||||
|     //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord | ||||
|     //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord. | ||||
|     // | ||||
|     // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer | ||||
|     // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional | ||||
|     // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all | ||||
|     // lanes are operated upon simultaneously. | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Local layout calculations | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // These routines are key. Subdivide the linearised cartesian index into | ||||
|   //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord | ||||
|   //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord. | ||||
|   // | ||||
|   // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer | ||||
|   // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional | ||||
|   // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all | ||||
|   // lanes are operated upon simultaneously. | ||||
|    | ||||
|     virtual int oIndex(std::vector<int> &coor) | ||||
|     { | ||||
|         int idx=0; | ||||
|         // Works with either global or local coordinates | ||||
|         for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); | ||||
|         return idx; | ||||
|     } | ||||
|     virtual int iIndex(std::vector<int> &lcoor) | ||||
|     { | ||||
|         int idx=0; | ||||
|         for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); | ||||
|         return idx; | ||||
|     } | ||||
|     inline int oIndexReduced(std::vector<int> &ocoor) | ||||
|     { | ||||
|       int idx=0;  | ||||
|       // ocoor is already reduced so can eliminate the modulo operation | ||||
|       // for fast indexing and inline the routine | ||||
|       for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; | ||||
|       return idx; | ||||
|     } | ||||
|     inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){ | ||||
|       Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); | ||||
|     } | ||||
|   virtual int oIndex(Coordinate &coor) | ||||
|   { | ||||
|     int idx=0; | ||||
|     // Works with either global or local coordinates | ||||
|     for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); | ||||
|     return idx; | ||||
|   } | ||||
|   virtual int iIndex(Coordinate &lcoor) | ||||
|   { | ||||
|     int idx=0; | ||||
|     for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); | ||||
|     return idx; | ||||
|   } | ||||
|   inline int oIndexReduced(Coordinate &ocoor) | ||||
|   { | ||||
|     int idx=0;  | ||||
|     // ocoor is already reduced so can eliminate the modulo operation | ||||
|     // for fast indexing and inline the routine | ||||
|     for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; | ||||
|     return idx; | ||||
|   } | ||||
|   inline void oCoorFromOindex (Coordinate& coor,int Oindex){ | ||||
|     Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); | ||||
|   } | ||||
|  | ||||
|     inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) { | ||||
|       lcoor.resize(_ndimension); | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|         lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; | ||||
|     } | ||||
|   inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) { | ||||
|     lcoor.resize(_ndimension); | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; | ||||
|   } | ||||
|  | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     // SIMD lane addressing | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     inline void iCoorFromIindex(std::vector<int> &coor,int lane) | ||||
|     { | ||||
|       Lexicographic::CoorFromIndex(coor,lane,_simd_layout); | ||||
|     } | ||||
|   ////////////////////////////////////////////////////////// | ||||
|   // SIMD lane addressing | ||||
|   ////////////////////////////////////////////////////////// | ||||
|   inline void iCoorFromIindex(Coordinate &coor,int lane) | ||||
|   { | ||||
|     Lexicographic::CoorFromIndex(coor,lane,_simd_layout); | ||||
|   } | ||||
|  | ||||
|     inline int PermuteDim(int dimension){ | ||||
|       return _simd_layout[dimension]>1; | ||||
|     } | ||||
|     inline int PermuteType(int dimension){ | ||||
|       int permute_type=0; | ||||
|       // | ||||
|       // FIXME: | ||||
|       // | ||||
|       // Best way to encode this would be to present a mask  | ||||
|       // for which simd dimensions are rotated, and the rotation | ||||
|       // size. If there is only one simd dimension rotated, this is just  | ||||
|       // a permute.  | ||||
|       // | ||||
|       // Cases: PermuteType == 1,2,4,8 | ||||
|       // Distance should be either 0,1,2.. | ||||
|       // | ||||
|       if ( _simd_layout[dimension] > 2 ) {  | ||||
|         for(int d=0;d<_ndimension;d++){ | ||||
|           if ( d != dimension ) assert ( (_simd_layout[d]==1)  ); | ||||
|         } | ||||
|         permute_type = RotateBit; // How to specify distance; this is not just direction. | ||||
|         return permute_type; | ||||
|       } | ||||
|  | ||||
|       for(int d=_ndimension-1;d>dimension;d--){ | ||||
|         if (_simd_layout[d]>1 ) permute_type++; | ||||
|   inline int PermuteDim(int dimension){ | ||||
|     return _simd_layout[dimension]>1; | ||||
|   } | ||||
|   inline int PermuteType(int dimension){ | ||||
|     int permute_type=0; | ||||
|     // | ||||
|     // Best way to encode this would be to present a mask  | ||||
|     // for which simd dimensions are rotated, and the rotation | ||||
|     // size. If there is only one simd dimension rotated, this is just  | ||||
|     // a permute.  | ||||
|     // | ||||
|     // Cases: PermuteType == 1,2,4,8 | ||||
|     // Distance should be either 0,1,2.. | ||||
|     // | ||||
|     if ( _simd_layout[dimension] > 2 ) {  | ||||
|       for(int d=0;d<_ndimension;d++){ | ||||
| 	if ( d != dimension ) assert ( (_simd_layout[d]==1)  ); | ||||
|       } | ||||
|       permute_type = RotateBit; // How to specify distance; this is not just direction. | ||||
|       return permute_type; | ||||
|     } | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Array sizing queries | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|  | ||||
|     inline int iSites(void) const { return _isites; }; | ||||
|     inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites | ||||
|     inline int oSites(void) const { return _osites; }; | ||||
|     inline int lSites(void) const { return _isites*_osites; };  | ||||
|     inline int gSites(void) const { return _isites*_osites*_Nprocessors; };  | ||||
|     inline int Nd    (void) const { return _ndimension;}; | ||||
|  | ||||
|     inline const std::vector<int> LocalStarts(void)             { return _lstart;    }; | ||||
|     inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;}; | ||||
|     inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;}; | ||||
|     inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;}; | ||||
|     inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;}; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Utility to print the full decomposition details  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|  | ||||
|     void show_decomposition(){ | ||||
|       std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl; | ||||
|       std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; | ||||
|       std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl; | ||||
|       std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl; | ||||
|       std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl; | ||||
|       std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl; | ||||
|       std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;         | ||||
|       std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl; | ||||
|       std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;              | ||||
|     }  | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Global addressing | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ | ||||
|       assert(gidx< gSites()); | ||||
|       Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); | ||||
|     for(int d=_ndimension-1;d>dimension;d--){ | ||||
|       if (_simd_layout[d]>1 ) permute_type++; | ||||
|     } | ||||
|     void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ | ||||
|       assert(lidx<lSites()); | ||||
|       Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); | ||||
|     return permute_type; | ||||
|   } | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Array sizing queries | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   inline int iSites(void) const { return _isites; }; | ||||
|   inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites | ||||
|   inline int oSites(void) const { return _osites; }; | ||||
|   inline int lSites(void) const { return _isites*_osites; };  | ||||
|   inline int gSites(void) const { return _isites*_osites*_Nprocessors; };  | ||||
|   inline int Nd    (void) const { return _ndimension;}; | ||||
|  | ||||
|   inline const Coordinate LocalStarts(void)             { return _lstart;    }; | ||||
|   inline const Coordinate &FullDimensions(void)         { return _fdimensions;}; | ||||
|   inline const Coordinate &GlobalDimensions(void)       { return _gdimensions;}; | ||||
|   inline const Coordinate &LocalDimensions(void)        { return _ldimensions;}; | ||||
|   inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;}; | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Utility to print the full decomposition details  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   void show_decomposition(){ | ||||
|     std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl; | ||||
|     std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl; | ||||
|     std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl; | ||||
|     std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl; | ||||
|     std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; | ||||
|     std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl; | ||||
|     std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl; | ||||
|     std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl; | ||||
|     std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl; | ||||
|     std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;         | ||||
|     std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl; | ||||
|     std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;              | ||||
|   }  | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Global addressing | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){ | ||||
|     assert(gidx< gSites()); | ||||
|     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); | ||||
|   } | ||||
|   void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){ | ||||
|     assert(lidx<lSites()); | ||||
|     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); | ||||
|   } | ||||
|   void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){ | ||||
|     gidx=0; | ||||
|     int mult=1; | ||||
|     for(int mu=0;mu<_ndimension;mu++) { | ||||
|       gidx+=mult*gcoor[mu]; | ||||
|       mult*=_gdimensions[mu]; | ||||
|     } | ||||
|     void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ | ||||
|       gidx=0; | ||||
|       int mult=1; | ||||
|       for(int mu=0;mu<_ndimension;mu++) { | ||||
|         gidx+=mult*gcoor[mu]; | ||||
|         mult*=_gdimensions[mu]; | ||||
|       } | ||||
|   } | ||||
|   void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor) | ||||
|   { | ||||
|     pcoor.resize(_ndimension); | ||||
|     lcoor.resize(_ndimension); | ||||
|     for(int mu=0;mu<_ndimension;mu++){ | ||||
|       int _fld  = _fdimensions[mu]/_processors[mu]; | ||||
|       pcoor[mu] = gcoor[mu]/_fld; | ||||
|       lcoor[mu] = gcoor[mu]%_fld; | ||||
|     } | ||||
|     void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor) | ||||
|     { | ||||
|       pcoor.resize(_ndimension); | ||||
|       lcoor.resize(_ndimension); | ||||
|       for(int mu=0;mu<_ndimension;mu++){ | ||||
|         int _fld  = _fdimensions[mu]/_processors[mu]; | ||||
|         pcoor[mu] = gcoor[mu]/_fld; | ||||
|         lcoor[mu] = gcoor[mu]%_fld; | ||||
|       } | ||||
|     } | ||||
|     void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor) | ||||
|     { | ||||
|       std::vector<int> pcoor; | ||||
|       std::vector<int> lcoor; | ||||
|       GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); | ||||
|       rank = RankFromProcessorCoor(pcoor); | ||||
|       /* | ||||
|       std::vector<int> cblcoor(lcoor); | ||||
|   } | ||||
|   void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor) | ||||
|   { | ||||
|     Coordinate pcoor; | ||||
|     Coordinate lcoor; | ||||
|     GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); | ||||
|     rank = RankFromProcessorCoor(pcoor); | ||||
|     /* | ||||
|       Coordinate cblcoor(lcoor); | ||||
|       for(int d=0;d<cblcoor.size();d++){ | ||||
|         if( this->CheckerBoarded(d) ) { | ||||
|           cblcoor[d] = lcoor[d]/2; | ||||
|         } | ||||
|       if( this->CheckerBoarded(d) ) { | ||||
|       cblcoor[d] = lcoor[d]/2; | ||||
|       } | ||||
|       */ | ||||
|       i_idx= iIndex(lcoor); | ||||
|       o_idx= oIndex(lcoor); | ||||
|     } | ||||
|       } | ||||
|     */ | ||||
|     i_idx= iIndex(lcoor); | ||||
|     o_idx= oIndex(lcoor); | ||||
|   } | ||||
|  | ||||
|     void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) | ||||
|     { | ||||
|       gcoor.resize(_ndimension); | ||||
|       std::vector<int> coor(_ndimension); | ||||
|   void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor) | ||||
|   { | ||||
|     gcoor.resize(_ndimension); | ||||
|     Coordinate coor(_ndimension); | ||||
|  | ||||
|       ProcessorCoorFromRank(rank,coor); | ||||
|       for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; | ||||
|     ProcessorCoorFromRank(rank,coor); | ||||
|     for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; | ||||
|  | ||||
|       iCoorFromIindex(coor,i_idx); | ||||
|       for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu]; | ||||
|     iCoorFromIindex(coor,i_idx); | ||||
|     for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu]; | ||||
|  | ||||
|       oCoorFromOindex (coor,o_idx); | ||||
|       for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; | ||||
|     oCoorFromOindex (coor,o_idx); | ||||
|     for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; | ||||
|        | ||||
|   } | ||||
|   void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor) | ||||
|   { | ||||
|     RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); | ||||
|     if(CheckerBoarded(0)){ | ||||
|       fcoor[0] = fcoor[0]*2+cb; | ||||
|     } | ||||
|     void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor) | ||||
|     { | ||||
|       RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); | ||||
|       if(CheckerBoarded(0)){ | ||||
|         fcoor[0] = fcoor[0]*2+cb; | ||||
|       } | ||||
|     } | ||||
|     void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor) | ||||
|     { | ||||
|       gcoor.resize(_ndimension); | ||||
|       for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; | ||||
|     } | ||||
|   } | ||||
|   void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor) | ||||
|   { | ||||
|     gcoor.resize(_ndimension); | ||||
|     for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,98 +23,101 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CARTESIAN_FULL_H | ||||
| #define GRID_CARTESIAN_FULL_H | ||||
|  | ||||
| namespace Grid{ | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|      | ||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Grid Support. | ||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|  | ||||
| class GridCartesian: public GridBase { | ||||
|  | ||||
| public: | ||||
|     int dummy; | ||||
|     virtual int  CheckerBoardFromOindexTable (int Oindex) { | ||||
|       return 0; | ||||
|     } | ||||
|     virtual int  CheckerBoardFromOindex (int Oindex) | ||||
|     { | ||||
|       return 0; | ||||
|     } | ||||
|     virtual int CheckerBoarded(int dim){ | ||||
|       return 0; | ||||
|     } | ||||
|     virtual int CheckerBoard(const std::vector<int> &site){ | ||||
|         return 0; | ||||
|     } | ||||
|     virtual int CheckerBoardDestination(int cb,int shift,int dim){ | ||||
|         return 0; | ||||
|     } | ||||
|     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ | ||||
|       return shift; | ||||
|     } | ||||
|     virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ | ||||
|       return shift; | ||||
|     } | ||||
|     ///////////////////////////////////////////////////////////////////////// | ||||
|     // Constructor takes a parent grid and possibly subdivides communicator. | ||||
|     ///////////////////////////////////////////////////////////////////////// | ||||
|     GridCartesian(const std::vector<int> &dimensions, | ||||
| 		  const std::vector<int> &simd_layout, | ||||
| 		  const std::vector<int> &processor_grid, | ||||
| 		  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) | ||||
|     { | ||||
|       Init(dimensions,simd_layout,processor_grid); | ||||
|     } | ||||
|     GridCartesian(const std::vector<int> &dimensions, | ||||
| 		  const std::vector<int> &simd_layout, | ||||
| 		  const std::vector<int> &processor_grid, | ||||
| 		  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) | ||||
|     { | ||||
|       Init(dimensions,simd_layout,processor_grid); | ||||
|     } | ||||
|     ///////////////////////////////////////////////////////////////////////// | ||||
|     // Construct from comm world | ||||
|     ///////////////////////////////////////////////////////////////////////// | ||||
|     GridCartesian(const std::vector<int> &dimensions, | ||||
| 		  const std::vector<int> &simd_layout, | ||||
| 		  const std::vector<int> &processor_grid) : GridBase(processor_grid) | ||||
|     { | ||||
|       Init(dimensions,simd_layout,processor_grid); | ||||
|     } | ||||
|   int dummy; | ||||
|   Coordinate _checker_dim_mask; | ||||
|   virtual int  CheckerBoardFromOindexTable (int Oindex) { | ||||
|     return 0; | ||||
|   } | ||||
|   virtual int  CheckerBoardFromOindex (int Oindex) | ||||
|   { | ||||
|     return 0; | ||||
|   } | ||||
|   virtual int CheckerBoarded(int dim){ | ||||
|     return 0; | ||||
|   } | ||||
|   virtual int CheckerBoard(const Coordinate &site){ | ||||
|     return 0; | ||||
|   } | ||||
|   virtual int CheckerBoardDestination(int cb,int shift,int dim){ | ||||
|     return 0; | ||||
|   } | ||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ | ||||
|     return shift; | ||||
|   } | ||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ | ||||
|     return shift; | ||||
|   } | ||||
|   ///////////////////////////////////////////////////////////////////////// | ||||
|   // Constructor takes a parent grid and possibly subdivides communicator. | ||||
|   ///////////////////////////////////////////////////////////////////////// | ||||
|   GridCartesian(const Coordinate &dimensions, | ||||
| 		const Coordinate &simd_layout, | ||||
| 		const Coordinate &processor_grid, | ||||
| 		const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) | ||||
|   { | ||||
|     Init(dimensions,simd_layout,processor_grid); | ||||
|   } | ||||
|   GridCartesian(const Coordinate &dimensions, | ||||
| 		const Coordinate &simd_layout, | ||||
| 		const Coordinate &processor_grid, | ||||
| 		const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) | ||||
|   { | ||||
|     Init(dimensions,simd_layout,processor_grid); | ||||
|   } | ||||
|   ///////////////////////////////////////////////////////////////////////// | ||||
|   // Construct from comm world | ||||
|   ///////////////////////////////////////////////////////////////////////// | ||||
|   GridCartesian(const Coordinate &dimensions, | ||||
| 		const Coordinate &simd_layout, | ||||
| 		const Coordinate &processor_grid) : GridBase(processor_grid) | ||||
|   { | ||||
|     Init(dimensions,simd_layout,processor_grid); | ||||
|   } | ||||
|  | ||||
|     virtual ~GridCartesian() = default; | ||||
|   virtual ~GridCartesian() = default; | ||||
|  | ||||
|     void Init(const std::vector<int> &dimensions, | ||||
| 	      const std::vector<int> &simd_layout, | ||||
| 	      const std::vector<int> &processor_grid) | ||||
|     { | ||||
|       /////////////////////// | ||||
|       // Grid information | ||||
|       /////////////////////// | ||||
|   void Init(const Coordinate &dimensions, | ||||
| 	    const Coordinate &simd_layout, | ||||
| 	    const Coordinate &processor_grid) | ||||
|   { | ||||
|     /////////////////////// | ||||
|     // Grid information | ||||
|     /////////////////////// | ||||
|       _isCheckerBoarded = false; | ||||
|       _ndimension = dimensions.size(); | ||||
|     _ndimension = dimensions.size(); | ||||
|  | ||||
|       _fdimensions.resize(_ndimension); | ||||
|       _gdimensions.resize(_ndimension); | ||||
|       _ldimensions.resize(_ndimension); | ||||
|       _rdimensions.resize(_ndimension); | ||||
|       _simd_layout.resize(_ndimension); | ||||
|       _lstart.resize(_ndimension); | ||||
|       _lend.resize(_ndimension); | ||||
|     _fdimensions.resize(_ndimension); | ||||
|     _gdimensions.resize(_ndimension); | ||||
|     _ldimensions.resize(_ndimension); | ||||
|     _rdimensions.resize(_ndimension); | ||||
|     _simd_layout.resize(_ndimension); | ||||
|     _checker_dim_mask.resize(_ndimension);; | ||||
|     _lstart.resize(_ndimension); | ||||
|     _lend.resize(_ndimension); | ||||
|  | ||||
|       _ostride.resize(_ndimension); | ||||
|       _istride.resize(_ndimension); | ||||
|     _ostride.resize(_ndimension); | ||||
|     _istride.resize(_ndimension); | ||||
|  | ||||
|       _fsites = _gsites = _osites = _isites = 1; | ||||
|     _fsites = _gsites = _osites = _isites = 1; | ||||
|  | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
| 	_checker_dim_mask[d]=0; | ||||
|  | ||||
|         _fdimensions[d] = dimensions[d];   // Global dimensions | ||||
|         _gdimensions[d] = _fdimensions[d]; // Global dimensions | ||||
|         _simd_layout[d] = simd_layout[d]; | ||||
| @@ -136,30 +139,30 @@ public: | ||||
|  | ||||
|         // Addressing support | ||||
|         if (d == 0) | ||||
|         { | ||||
|           _ostride[d] = 1; | ||||
|           _istride[d] = 1; | ||||
|         } | ||||
| 	  { | ||||
| 	    _ostride[d] = 1; | ||||
| 	    _istride[d] = 1; | ||||
| 	  } | ||||
|         else | ||||
|         { | ||||
|           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||
|           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||
|         } | ||||
| 	  { | ||||
| 	    _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||
| 	    _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||
| 	  } | ||||
|       } | ||||
|  | ||||
|       /////////////////////// | ||||
|       // subplane information | ||||
|       /////////////////////// | ||||
|       _slice_block.resize(_ndimension); | ||||
|       _slice_stride.resize(_ndimension); | ||||
|       _slice_nblock.resize(_ndimension); | ||||
|     /////////////////////// | ||||
|     // subplane information | ||||
|     /////////////////////// | ||||
|     _slice_block.resize(_ndimension); | ||||
|     _slice_stride.resize(_ndimension); | ||||
|     _slice_nblock.resize(_ndimension); | ||||
|  | ||||
|       int block = 1; | ||||
|       int nblock = 1; | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|         nblock *= _rdimensions[d]; | ||||
|     int block = 1; | ||||
|     int nblock = 1; | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       nblock *= _rdimensions[d]; | ||||
|  | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         nblock /= _rdimensions[d]; | ||||
|         _slice_block[d] = block; | ||||
| @@ -167,8 +170,9 @@ public: | ||||
|         _slice_nblock[d] = nblock; | ||||
|         block = block * _rdimensions[d]; | ||||
|       } | ||||
|     }; | ||||
|   }; | ||||
|  | ||||
| }; | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,178 +24,163 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CARTESIAN_RED_BLACK_H | ||||
| #define GRID_CARTESIAN_RED_BLACK_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| namespace Grid { | ||||
| static const int CbRed  =0; | ||||
| static const int CbBlack=1; | ||||
| static const int Even   =CbRed; | ||||
| static const int Odd    =CbBlack; | ||||
|  | ||||
| accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) | ||||
| { | ||||
|   int nd=rdim.size(); | ||||
|   Coordinate coor(nd); | ||||
|  | ||||
|   Lexicographic::CoorFromIndex(coor,oindex,rdim); | ||||
|  | ||||
|   int linear=0; | ||||
|   for(int d=0;d<nd;d++){ | ||||
|     if(chk_dim_msk[d]) | ||||
|       linear=linear+coor[d]; | ||||
|   } | ||||
|   return (linear&0x1); | ||||
| } | ||||
|  | ||||
|   static const int CbRed  =0; | ||||
|   static const int CbBlack=1; | ||||
|   static const int Even   =CbRed; | ||||
|   static const int Odd    =CbBlack; | ||||
|      | ||||
| // Specialise this for red black grids storing half the data like a chess board. | ||||
| class GridRedBlackCartesian : public GridBase | ||||
| { | ||||
| public: | ||||
|     std::vector<int> _checker_dim_mask; | ||||
|     int              _checker_dim; | ||||
|     std::vector<int> _checker_board; | ||||
|   //  Coordinate _checker_dim_mask; | ||||
|   int              _checker_dim; | ||||
|   std::vector<int> _checker_board; | ||||
|  | ||||
|     virtual int CheckerBoarded(int dim){ | ||||
|       if( dim==_checker_dim) return 1; | ||||
|       else return 0; | ||||
|     } | ||||
|     virtual int CheckerBoard(const std::vector<int> &site){ | ||||
|       int linear=0; | ||||
|       assert(site.size()==_ndimension); | ||||
|       for(int d=0;d<_ndimension;d++){  | ||||
| 	if(_checker_dim_mask[d]) | ||||
| 	  linear=linear+site[d]; | ||||
|       } | ||||
|       return (linear&0x1); | ||||
|   virtual int CheckerBoarded(int dim){ | ||||
|     if( dim==_checker_dim) return 1; | ||||
|     else return 0; | ||||
|   } | ||||
|   virtual int CheckerBoard(const Coordinate &site){ | ||||
|     int linear=0; | ||||
|     assert(site.size()==_ndimension); | ||||
|     for(int d=0;d<_ndimension;d++){  | ||||
|       if(_checker_dim_mask[d]) | ||||
| 	linear=linear+site[d]; | ||||
|     } | ||||
|     return (linear&0x1); | ||||
|   } | ||||
|  | ||||
|   // Depending on the cb of site, we toggle source cb. | ||||
|   // for block #b, element #e = (b, e) | ||||
|   // we need  | ||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){ | ||||
|     if(dim != _checker_dim) return shift; | ||||
|  | ||||
|     // Depending on the cb of site, we toggle source cb. | ||||
|     // for block #b, element #e = (b, e) | ||||
|     // we need  | ||||
|     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){ | ||||
|       if(dim != _checker_dim) return shift; | ||||
|     int fulldim =_fdimensions[dim]; | ||||
|     shift = (shift+fulldim)%fulldim; | ||||
|  | ||||
|       int fulldim =_fdimensions[dim]; | ||||
|       shift = (shift+fulldim)%fulldim; | ||||
|  | ||||
|       // Probably faster with table lookup; | ||||
|       // or by looping over x,y,z and multiply rather than computing checkerboard. | ||||
|     // Probably faster with table lookup; | ||||
|     // or by looping over x,y,z and multiply rather than computing checkerboard. | ||||
| 	   | ||||
|       if ( (source_cb+ocb)&1 ) { | ||||
| 	return (shift)/2; | ||||
|       } else { | ||||
| 	return (shift+1)/2; | ||||
|       } | ||||
|     if ( (source_cb+ocb)&1 ) { | ||||
|       return (shift)/2; | ||||
|     } else { | ||||
|       return (shift+1)/2; | ||||
|     } | ||||
|     virtual int  CheckerBoardFromOindexTable (int Oindex) { | ||||
|       return _checker_board[Oindex]; | ||||
|     } | ||||
|     virtual int  CheckerBoardFromOindex (int Oindex) | ||||
|     { | ||||
|       std::vector<int> ocoor; | ||||
|       oCoorFromOindex(ocoor,Oindex); | ||||
|       return CheckerBoard(ocoor); | ||||
|     } | ||||
|     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ | ||||
|   } | ||||
|   virtual int  CheckerBoardFromOindexTable (int Oindex) { | ||||
|     return _checker_board[Oindex]; | ||||
|   } | ||||
|   virtual int  CheckerBoardFromOindex (int Oindex) | ||||
|   { | ||||
|     Coordinate ocoor; | ||||
|     oCoorFromOindex(ocoor,Oindex); | ||||
|     return CheckerBoard(ocoor); | ||||
|   } | ||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ | ||||
|  | ||||
|       if(dim != _checker_dim) return shift; | ||||
|     if(dim != _checker_dim) return shift; | ||||
|  | ||||
|       int ocb=CheckerBoardFromOindex(osite); | ||||
|     int ocb=CheckerBoardFromOindex(osite); | ||||
|        | ||||
|       return CheckerBoardShiftForCB(source_cb,dim,shift,ocb); | ||||
|     } | ||||
|     return CheckerBoardShiftForCB(source_cb,dim,shift,ocb); | ||||
|   } | ||||
|      | ||||
|     virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ | ||||
|       if ( _checker_dim_mask[dim]  ) { | ||||
| 	// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims | ||||
| 	// does NOT cause a parity hop. | ||||
| 	int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim]; | ||||
|         if ( (shift+add) &0x1) { | ||||
|             return 1-source_cb; | ||||
|         } else { | ||||
|             return source_cb; | ||||
|         } | ||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ | ||||
|     if ( _checker_dim_mask[dim]  ) { | ||||
|       // If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims | ||||
|       // does NOT cause a parity hop. | ||||
|       int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim]; | ||||
|       if ( (shift+add) &0x1) { | ||||
| 	return 1-source_cb; | ||||
|       } else { | ||||
| 	return source_cb; | ||||
|  | ||||
|       } | ||||
|     }; | ||||
|     } else { | ||||
|       return source_cb; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     // Create Redblack from original grid; require full grid pointer ? | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) | ||||
|     { | ||||
|       int dims = base->_ndimension; | ||||
|       std::vector<int> checker_dim_mask(dims,1); | ||||
|       int checker_dim = 0; | ||||
|       Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); | ||||
|     }; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     // Create redblack from original grid, with non-trivial checker dim mask | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     GridRedBlackCartesian(const GridBase *base, | ||||
| 			  const std::vector<int> &checker_dim_mask, | ||||
| 			  int checker_dim | ||||
| 			  ) :  GridBase(base->_processors,*base)  | ||||
|     { | ||||
|       Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|     virtual ~GridRedBlackCartesian() = default; | ||||
| #if 0 | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     // Create redblack grid ;; deprecate these. Should not | ||||
|     // need direct creation of redblack without a full grid to base on | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     GridRedBlackCartesian(const GridBase *base, | ||||
| 			  const std::vector<int> &dimensions, | ||||
| 			  const std::vector<int> &simd_layout, | ||||
| 			  const std::vector<int> &processor_grid, | ||||
| 			  const std::vector<int> &checker_dim_mask, | ||||
| 			  int checker_dim | ||||
| 			  ) :  GridBase(processor_grid,*base)  | ||||
|     { | ||||
|       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); | ||||
|     } | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Create Redblack from original grid; require full grid pointer ? | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) | ||||
|   { | ||||
|     int dims = base->_ndimension; | ||||
|     Coordinate checker_dim_mask(dims,1); | ||||
|     int checker_dim = 0; | ||||
|     Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); | ||||
|   }; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     // Create redblack grid | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     GridRedBlackCartesian(const GridBase *base, | ||||
| 			  const std::vector<int> &dimensions, | ||||
| 			  const std::vector<int> &simd_layout, | ||||
| 			  const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)  | ||||
|     { | ||||
|       std::vector<int> checker_dim_mask(dimensions.size(),1); | ||||
|       int checker_dim = 0; | ||||
|       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); | ||||
|     } | ||||
| #endif | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Create redblack from original grid, with non-trivial checker dim mask | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   GridRedBlackCartesian(const GridBase *base, | ||||
| 			const Coordinate &checker_dim_mask, | ||||
| 			int checker_dim | ||||
| 			) :  GridBase(base->_processors,*base)  | ||||
|   { | ||||
|     Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ; | ||||
|   } | ||||
|  | ||||
|     void Init(const std::vector<int> &dimensions, | ||||
|               const std::vector<int> &simd_layout, | ||||
|               const std::vector<int> &processor_grid, | ||||
|               const std::vector<int> &checker_dim_mask, | ||||
|               int checker_dim) | ||||
|     { | ||||
|   virtual ~GridRedBlackCartesian() = default; | ||||
|  | ||||
|   void Init(const Coordinate &dimensions, | ||||
| 	    const Coordinate &simd_layout, | ||||
| 	    const Coordinate &processor_grid, | ||||
| 	    const Coordinate &checker_dim_mask, | ||||
| 	    int checker_dim) | ||||
|   { | ||||
|  | ||||
|       _isCheckerBoarded = true; | ||||
|       _checker_dim = checker_dim; | ||||
|       assert(checker_dim_mask[checker_dim] == 1); | ||||
|       _ndimension = dimensions.size(); | ||||
|       assert(checker_dim_mask.size() == _ndimension); | ||||
|       assert(processor_grid.size() == _ndimension); | ||||
|       assert(simd_layout.size() == _ndimension); | ||||
|     _checker_dim = checker_dim; | ||||
|     assert(checker_dim_mask[checker_dim] == 1); | ||||
|     _ndimension = dimensions.size(); | ||||
|     assert(checker_dim_mask.size() == _ndimension); | ||||
|     assert(processor_grid.size() == _ndimension); | ||||
|     assert(simd_layout.size() == _ndimension); | ||||
|  | ||||
|       _fdimensions.resize(_ndimension); | ||||
|       _gdimensions.resize(_ndimension); | ||||
|       _ldimensions.resize(_ndimension); | ||||
|       _rdimensions.resize(_ndimension); | ||||
|       _simd_layout.resize(_ndimension); | ||||
|       _lstart.resize(_ndimension); | ||||
|       _lend.resize(_ndimension); | ||||
|     _fdimensions.resize(_ndimension); | ||||
|     _gdimensions.resize(_ndimension); | ||||
|     _ldimensions.resize(_ndimension); | ||||
|     _rdimensions.resize(_ndimension); | ||||
|     _simd_layout.resize(_ndimension); | ||||
|     _lstart.resize(_ndimension); | ||||
|     _lend.resize(_ndimension); | ||||
|  | ||||
|       _ostride.resize(_ndimension); | ||||
|       _istride.resize(_ndimension); | ||||
|     _ostride.resize(_ndimension); | ||||
|     _istride.resize(_ndimension); | ||||
|  | ||||
|       _fsites = _gsites = _osites = _isites = 1; | ||||
|     _fsites = _gsites = _osites = _isites = 1; | ||||
|  | ||||
|       _checker_dim_mask = checker_dim_mask; | ||||
|     _checker_dim_mask = checker_dim_mask; | ||||
|  | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         _fdimensions[d] = dimensions[d]; | ||||
|         _gdimensions[d] = _fdimensions[d]; | ||||
| @@ -203,11 +188,11 @@ public: | ||||
|         _gsites = _gsites * _gdimensions[d]; | ||||
|  | ||||
|         if (d == _checker_dim) | ||||
|         { | ||||
|           assert((_gdimensions[d] & 0x1) == 0); | ||||
|           _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard | ||||
| 	  _gsites /= 2; | ||||
|         } | ||||
| 	  { | ||||
| 	    assert((_gdimensions[d] & 0x1) == 0); | ||||
| 	    _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard | ||||
| 	    _gsites /= 2; | ||||
| 	  } | ||||
|         _ldimensions[d] = _gdimensions[d] / _processors[d]; | ||||
|         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); | ||||
|         _lstart[d] = _processor_coor[d] * _ldimensions[d]; | ||||
| @@ -222,42 +207,42 @@ public: | ||||
|         // all elements of a simd vector must have same checkerboard. | ||||
|         // If Ls vectorised, this must still be the case; e.g. dwf rb5d | ||||
|         if (_simd_layout[d] > 1) | ||||
|         { | ||||
|           if (checker_dim_mask[d]) | ||||
|           { | ||||
|             assert((_rdimensions[d] & 0x1) == 0); | ||||
|           } | ||||
|         } | ||||
| 	  { | ||||
| 	    if (checker_dim_mask[d]) | ||||
| 	      { | ||||
| 		assert((_rdimensions[d] & 0x1) == 0); | ||||
| 	      } | ||||
| 	  } | ||||
|  | ||||
|         _osites *= _rdimensions[d]; | ||||
|         _isites *= _simd_layout[d]; | ||||
|  | ||||
|         // Addressing support | ||||
|         if (d == 0) | ||||
|         { | ||||
|           _ostride[d] = 1; | ||||
|           _istride[d] = 1; | ||||
|         } | ||||
| 	  { | ||||
| 	    _ostride[d] = 1; | ||||
| 	    _istride[d] = 1; | ||||
| 	  } | ||||
|         else | ||||
|         { | ||||
|           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||
|           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||
|         } | ||||
| 	  { | ||||
| 	    _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||
| 	    _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||
| 	  } | ||||
|       } | ||||
|  | ||||
|       //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|       // subplane information | ||||
|       //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|       _slice_block.resize(_ndimension); | ||||
|       _slice_stride.resize(_ndimension); | ||||
|       _slice_nblock.resize(_ndimension); | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // subplane information | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     _slice_block.resize(_ndimension); | ||||
|     _slice_stride.resize(_ndimension); | ||||
|     _slice_nblock.resize(_ndimension); | ||||
|  | ||||
|       int block = 1; | ||||
|       int nblock = 1; | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|         nblock *= _rdimensions[d]; | ||||
|     int block = 1; | ||||
|     int nblock = 1; | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       nblock *= _rdimensions[d]; | ||||
|  | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         nblock /= _rdimensions[d]; | ||||
|         _slice_block[d] = block; | ||||
| @@ -266,55 +251,55 @@ public: | ||||
|         block = block * _rdimensions[d]; | ||||
|       } | ||||
|  | ||||
|       //////////////////////////////////////////////// | ||||
|       // Create a checkerboard lookup table | ||||
|       //////////////////////////////////////////////// | ||||
|       int rvol = 1; | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|     //////////////////////////////////////////////// | ||||
|     // Create a checkerboard lookup table | ||||
|     //////////////////////////////////////////////// | ||||
|     int rvol = 1; | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         rvol = rvol * _rdimensions[d]; | ||||
|       } | ||||
|       _checker_board.resize(rvol); | ||||
|       for (int osite = 0; osite < _osites; osite++) | ||||
|     _checker_board.resize(rvol); | ||||
|     for (int osite = 0; osite < _osites; osite++) | ||||
|       { | ||||
|         _checker_board[osite] = CheckerBoardFromOindex(osite); | ||||
|       } | ||||
|     }; | ||||
|   }; | ||||
|  | ||||
|   protected: | ||||
|     virtual int oIndex(std::vector<int> &coor) | ||||
|     { | ||||
|       int idx = 0; | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
| protected: | ||||
|   virtual int oIndex(Coordinate &coor) | ||||
|   { | ||||
|     int idx = 0; | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         if (d == _checker_dim) | ||||
|         { | ||||
|           idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); | ||||
|         } | ||||
| 	  { | ||||
| 	    idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); | ||||
| 	  } | ||||
|         else | ||||
|         { | ||||
|           idx += _ostride[d] * (coor[d] % _rdimensions[d]); | ||||
|         } | ||||
| 	  { | ||||
| 	    idx += _ostride[d] * (coor[d] % _rdimensions[d]); | ||||
| 	  } | ||||
|       } | ||||
|       return idx; | ||||
|     }; | ||||
|     return idx; | ||||
|   }; | ||||
|  | ||||
|     virtual int iIndex(std::vector<int> &lcoor) | ||||
|     { | ||||
|       int idx = 0; | ||||
|       for (int d = 0; d < _ndimension; d++) | ||||
|   virtual int iIndex(Coordinate &lcoor) | ||||
|   { | ||||
|     int idx = 0; | ||||
|     for (int d = 0; d < _ndimension; d++) | ||||
|       { | ||||
|         if (d == _checker_dim) | ||||
|         { | ||||
|           idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); | ||||
|         } | ||||
| 	  { | ||||
| 	    idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); | ||||
| 	  } | ||||
|         else | ||||
|         { | ||||
|           idx += _istride[d] * (lcoor[d] / _rdimensions[d]); | ||||
|         } | ||||
| 	  { | ||||
| 	    idx += _istride[d] * (lcoor[d] / _rdimensions[d]); | ||||
| 	  } | ||||
|       } | ||||
|       return idx; | ||||
|     } | ||||
|     return idx; | ||||
|   } | ||||
| }; | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,11 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_COMMUNICATOR_H | ||||
| #define GRID_COMMUNICATOR_H | ||||
|  | ||||
| #include <Grid/util/Coordinate.h> | ||||
| #include <Grid/communicator/SharedMemory.h> | ||||
| #include <Grid/communicator/Communicator_base.h> | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,15 +23,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Grid/GridCore.h> | ||||
| #include <fcntl.h> | ||||
| #include <unistd.h> | ||||
| #include <limits.h> | ||||
| #include <sys/mman.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /////////////////////////////////////////////////////////////// | ||||
| // Info that is set up once and is independent of the Cartesian layout | ||||
| @@ -47,8 +47,8 @@ int                      CartesianCommunicator::Dimensions(void)        { return | ||||
| int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; }; | ||||
| int                      CartesianCommunicator::BossRank(void)          { return 0; }; | ||||
| int                      CartesianCommunicator::ThisRank(void)          { return _processor; }; | ||||
| const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; | ||||
| const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)     { return _processors; }; | ||||
| const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; | ||||
| const Coordinate & CartesianCommunicator::ProcessorGrid(void)     { return _processors; }; | ||||
| int                      CartesianCommunicator::ProcessorCount(void)    { return _Nprocessors; }; | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -72,5 +72,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | ||||
|   GlobalSumVector((double *)c,2*N); | ||||
| } | ||||
|    | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
|  | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_COMMUNICATOR_BASE_H | ||||
| #define GRID_COMMUNICATOR_BASE_H | ||||
|  | ||||
| @@ -34,7 +34,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| /////////////////////////////////// | ||||
| #include <Grid/communicator/SharedMemory.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| class CartesianCommunicator : public SharedMemory { | ||||
|  | ||||
| @@ -52,9 +52,9 @@ public: | ||||
|   // Communicator should know nothing of the physics grid, only processor grid. | ||||
|   //////////////////////////////////////////// | ||||
|   int              _Nprocessors;     // How many in all | ||||
|   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes. | ||||
|   Coordinate _processors;      // Which dimensions get relayed out over processors lanes. | ||||
|   int              _processor;       // linear processor rank | ||||
|   std::vector<int> _processor_coor;  // linear processor coordinate | ||||
|   Coordinate _processor_coor;  // linear processor coordinate | ||||
|   unsigned long    _ndimension; | ||||
|   static Grid_MPI_Comm      communicator_world; | ||||
|   Grid_MPI_Comm             communicator; | ||||
| @@ -69,34 +69,34 @@ public: | ||||
|   // Constructors to sub-divide a parent communicator | ||||
|   // and default to comm world | ||||
|   //////////////////////////////////////////////// | ||||
|   CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank); | ||||
|   CartesianCommunicator(const std::vector<int> &pdimensions_in); | ||||
|   CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank); | ||||
|   CartesianCommunicator(const Coordinate &pdimensions_in); | ||||
|   virtual ~CartesianCommunicator(); | ||||
|  | ||||
|  private: | ||||
| private: | ||||
|  | ||||
|   //////////////////////////////////////////////// | ||||
|   // Private initialise from an MPI communicator | ||||
|   // Can use after an MPI_Comm_split, but hidden from user so private | ||||
|   //////////////////////////////////////////////// | ||||
|   void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base); | ||||
|  | ||||
|  public: | ||||
|   void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base); | ||||
|  | ||||
| public: | ||||
|    | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Wraps MPI_Cart routines, or implements equivalent on other impls | ||||
|   //////////////////////////////////////////////////////////////////////////////////////// | ||||
|   void ShiftedRanks(int dim,int shift,int & source, int & dest); | ||||
|   int  RankFromProcessorCoor(std::vector<int> &coor); | ||||
|   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); | ||||
|   int  RankFromProcessorCoor(Coordinate &coor); | ||||
|   void ProcessorCoorFromRank(int rank,Coordinate &coor); | ||||
|    | ||||
|   int                      Dimensions(void)        ; | ||||
|   int                      IsBoss(void)            ; | ||||
|   int                      BossRank(void)          ; | ||||
|   int                      ThisRank(void)          ; | ||||
|   const std::vector<int> & ThisProcessorCoor(void) ; | ||||
|   const std::vector<int> & ProcessorGrid(void)     ; | ||||
|   const Coordinate & ThisProcessorCoor(void) ; | ||||
|   const Coordinate & ProcessorGrid(void)     ; | ||||
|   int                      ProcessorCount(void)    ; | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -114,6 +114,7 @@ public: | ||||
|   void GlobalSumVector(RealD *,int N); | ||||
|   void GlobalSum(uint32_t &); | ||||
|   void GlobalSum(uint64_t &); | ||||
|   void GlobalSumVector(uint64_t*,int N); | ||||
|   void GlobalSum(ComplexF &c); | ||||
|   void GlobalSumVector(ComplexF *c,int N); | ||||
|   void GlobalSum(ComplexD &c); | ||||
| @@ -197,11 +198,12 @@ public: | ||||
|   void AllToAll(void  *in,void *out,uint64_t words         ,uint64_t bytes); | ||||
|    | ||||
|   template<class obj> void Broadcast(int root,obj &data) | ||||
|     { | ||||
|       Broadcast(root,(void *)&data,sizeof(data)); | ||||
|     }; | ||||
|   { | ||||
|     Broadcast(root,(void *)&data,sizeof(data)); | ||||
|   } | ||||
|  | ||||
| };  | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -23,12 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Grid/GridCore.h> | ||||
| #include <Grid/communicator/SharedMemory.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| Grid_MPI_Comm       CartesianCommunicator::communicator_world; | ||||
|  | ||||
| @@ -44,10 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) | ||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||
|   if ( !flag ) { | ||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||
|  | ||||
|     //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE | ||||
|     if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) || | ||||
|         (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) ) | ||||
|     if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { | ||||
|       assert(0); | ||||
|     } | ||||
|  | ||||
|     if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { | ||||
|       assert(0); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Never clean up as done once. | ||||
| @@ -69,14 +74,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest | ||||
|   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||
| int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) | ||||
| { | ||||
|   int rank; | ||||
|   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank); | ||||
|   assert(ierr==0); | ||||
|   return rank; | ||||
| } | ||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) | ||||
| { | ||||
|   coor.resize(_ndimension); | ||||
|   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]); | ||||
| @@ -86,7 +91,7 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Initialises from communicator_world | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)  | ||||
| CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)  | ||||
| { | ||||
|   MPI_Comm optimal_comm; | ||||
|   //////////////////////////////////////////////////// | ||||
| @@ -105,12 +110,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
| ////////////////////////////////// | ||||
| // Try to subdivide communicator | ||||
| ////////////////////////////////// | ||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)     | ||||
| CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)     | ||||
| { | ||||
|   _ndimension = processors.size();  assert(_ndimension>=1); | ||||
|   int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); | ||||
|   std::vector<int> parent_processor_coor(_ndimension,0); | ||||
|   std::vector<int> parent_processors    (_ndimension,1); | ||||
|   Coordinate parent_processor_coor(_ndimension,0); | ||||
|   Coordinate parent_processors    (_ndimension,1); | ||||
|  | ||||
|   // Can make 5d grid from 4d etc... | ||||
|   int pad = _ndimension-parent_ndimension; | ||||
| @@ -133,9 +138,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | ||||
|   int Nchild = Nparent/childsize; | ||||
|   assert (childsize * Nchild == Nparent); | ||||
|  | ||||
|   std::vector<int> ccoor(_ndimension); // coor within subcommunicator | ||||
|   std::vector<int> scoor(_ndimension); // coor of split within parent | ||||
|   std::vector<int> ssize(_ndimension); // coor of split within parent | ||||
|   Coordinate ccoor(_ndimension); // coor within subcommunicator | ||||
|   Coordinate scoor(_ndimension); // coor of split within parent | ||||
|   Coordinate ssize(_ndimension); // coor of split within parent | ||||
|  | ||||
|   for(int d=0;d<_ndimension;d++){ | ||||
|     ccoor[d] = parent_processor_coor[d] % processors[d]; | ||||
| @@ -152,36 +157,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | ||||
|   MPI_Comm comm_split; | ||||
|   if ( Nchild > 1 ) {  | ||||
|  | ||||
|     if(0){ | ||||
|       std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl; | ||||
|       std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    "; | ||||
|       for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " "; | ||||
|       std::cout<<std::endl; | ||||
|        | ||||
|       std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    "; | ||||
|       for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " "; | ||||
|       std::cout<<std::endl; | ||||
|        | ||||
|       std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    "; | ||||
|       for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " "; | ||||
|       std::cout<<std::endl; | ||||
|        | ||||
|       std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    "; | ||||
|       for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " "; | ||||
|       std::cout<<std::endl; | ||||
|        | ||||
|       std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    "; | ||||
|       for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " "; | ||||
|       std::cout<<std::endl; | ||||
|  | ||||
|       ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|       // Declare victory | ||||
|       ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|       std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into " | ||||
| 		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl; | ||||
|       std::cout << " Split communicator " <<comm_split <<std::endl; | ||||
|     } | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Split the communicator | ||||
|     //////////////////////////////////////////////////////////////// | ||||
| @@ -203,7 +178,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | ||||
|   // Take the right SHM buffers | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   SetCommunicator(comm_split); | ||||
|    | ||||
|  | ||||
|   /////////////////////////////////////////////// | ||||
|   // Free the temp communicator  | ||||
|   /////////////////////////////////////////////// | ||||
| @@ -220,7 +195,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | ||||
|   } | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) | ||||
| void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base) | ||||
| { | ||||
|   //////////////////////////////////////////////////// | ||||
|   // Creates communicator, and the communicator_halo | ||||
| @@ -237,7 +212,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc | ||||
|     _Nprocessors*=_processors[i]; | ||||
|   } | ||||
|  | ||||
|   std::vector<int> periodic(_ndimension,1); | ||||
|   Coordinate periodic(_ndimension,1); | ||||
|   MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); | ||||
|   MPI_Comm_rank(communicator,&_processor); | ||||
|   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||
| @@ -280,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||
|   assert(ierr==0); | ||||
| @@ -474,7 +453,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||
|  | ||||
| void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes) | ||||
| { | ||||
|   std::vector<int> row(_ndimension,1); | ||||
|   Coordinate row(_ndimension,1); | ||||
|   assert(dim>=0 && dim<_ndimension); | ||||
|  | ||||
|   //  Split the communicator | ||||
| @@ -503,7 +482,6 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t | ||||
|   MPI_Type_free(&object); | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,11 +23,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Info that is set up once and is independent of the Cartesian layout | ||||
| @@ -38,18 +38,18 @@ void CartesianCommunicator::Init(int *argc, char *** arv) | ||||
| { | ||||
|   GlobalSharedMemory::Init(communicator_world); | ||||
|   GlobalSharedMemory::SharedMemoryAllocate( | ||||
| 		   GlobalSharedMemory::MAX_MPI_SHM_BYTES, | ||||
| 		   GlobalSharedMemory::Hugepages); | ||||
| 					   GlobalSharedMemory::MAX_MPI_SHM_BYTES, | ||||
| 					   GlobalSharedMemory::Hugepages); | ||||
| } | ||||
|  | ||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)  | ||||
| CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)  | ||||
|   : CartesianCommunicator(processors)  | ||||
| { | ||||
|   srank=0; | ||||
|   SetCommunicator(communicator_world); | ||||
| } | ||||
|  | ||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||
| CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) | ||||
| { | ||||
|   _processors = processors; | ||||
|   _ndimension = processors.size();  assert(_ndimension>=1); | ||||
| @@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){} | ||||
| void CartesianCommunicator::GlobalSum(float &){} | ||||
| void CartesianCommunicator::GlobalSumVector(float *,int N){} | ||||
| void CartesianCommunicator::GlobalSum(double &){} | ||||
| void CartesianCommunicator::GlobalSumVector(double *,int N){} | ||||
| void CartesianCommunicator::GlobalSum(uint32_t &){} | ||||
| void CartesianCommunicator::GlobalSum(uint64_t &){} | ||||
| void CartesianCommunicator::GlobalSumVector(double *,int N){} | ||||
| void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &){} | ||||
| void CartesianCommunicator::GlobalXOR(uint64_t &){} | ||||
|  | ||||
| @@ -122,8 +123,8 @@ int  CartesianCommunicator::RankWorld(void){return 0;} | ||||
| void CartesianCommunicator::Barrier(void){} | ||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | ||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | ||||
| int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | ||||
| void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  coor = _processor_coor; } | ||||
| int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;} | ||||
| void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; } | ||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||
| { | ||||
|   source =0; | ||||
| @@ -160,6 +161,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque | ||||
|  | ||||
| void CartesianCommunicator::StencilBarrier(void){}; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -28,10 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid {  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
|  | ||||
| // static data | ||||
|  | ||||
| int                 GlobalSharedMemory::HPEhypercube = 1; | ||||
| uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL;  | ||||
| int                 GlobalSharedMemory::Hugepages = 0; | ||||
| int                 GlobalSharedMemory::_ShmSetup; | ||||
| @@ -73,9 +74,12 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){ | ||||
|   if (heap_bytes >= heap_size) { | ||||
|     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl; | ||||
|     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; | ||||
|     std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; | ||||
|     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; | ||||
|     std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl; | ||||
|     std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl; | ||||
|     assert(heap_bytes<heap_size); | ||||
|   } | ||||
|   //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; | ||||
|   return ptr; | ||||
| } | ||||
| void SharedMemory::ShmBufferFreeAll(void) {  | ||||
| @@ -84,9 +88,9 @@ void SharedMemory::ShmBufferFreeAll(void) { | ||||
| } | ||||
| void *SharedMemory::ShmBufferSelf(void) | ||||
| { | ||||
|   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; | ||||
|   return ShmCommBufs[ShmRank]; | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid);  | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -25,18 +25,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
|  | ||||
| // TODO | ||||
| // 1) move includes into SharedMemory.cc | ||||
| // | ||||
| // 2) split shared memory into a) optimal communicator creation from comm world | ||||
| //  | ||||
| //                             b) shared memory buffers container | ||||
| //                                -- static globally shared; init once | ||||
| //                                -- per instance set of buffers. | ||||
| //                                    | ||||
|  | ||||
| #pragma once  | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
| @@ -53,30 +41,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <sys/shm.h> | ||||
| #include <sys/mman.h> | ||||
| #include <zlib.h> | ||||
| #ifdef HAVE_NUMAIF_H | ||||
| #include <numaif.h> | ||||
| #endif | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| #if defined (GRID_COMMS_MPI3)  | ||||
|   typedef MPI_Comm    Grid_MPI_Comm; | ||||
|   typedef MPI_Request CommsRequest_t; | ||||
| typedef MPI_Comm    Grid_MPI_Comm; | ||||
| typedef MPI_Request CommsRequest_t; | ||||
| #else  | ||||
|   typedef int CommsRequest_t; | ||||
|   typedef int Grid_MPI_Comm; | ||||
| typedef int CommsRequest_t; | ||||
| typedef int Grid_MPI_Comm; | ||||
| #endif | ||||
|  | ||||
| class GlobalSharedMemory { | ||||
|  private: | ||||
| private: | ||||
|   static const int     MAXLOG2RANKSPERNODE = 16;             | ||||
|  | ||||
|  | ||||
|   // Init once lock on the buffer allocation | ||||
|   static int      _ShmSetup; | ||||
|   static int      _ShmAlloc; | ||||
|   static uint64_t _ShmAllocBytes; | ||||
|  | ||||
|  public: | ||||
| public: | ||||
|   /////////////////////////////////////// | ||||
|   // HPE 8600 hypercube optimisation | ||||
|   /////////////////////////////////////// | ||||
|   static int HPEhypercube; | ||||
|  | ||||
|   static int      ShmSetup(void)      { return _ShmSetup; } | ||||
|   static int      ShmAlloc(void)      { return _ShmAlloc; } | ||||
|   static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; } | ||||
| @@ -102,14 +93,17 @@ class GlobalSharedMemory { | ||||
|   // Create an optimal reordered communicator that makes MPI_Cart_create get it right | ||||
|   ////////////////////////////////////////////////////////////////////////////////////// | ||||
|   static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD | ||||
|   static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian | ||||
|   static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims); | ||||
|   /////////////////////////////////////////////////// | ||||
|   // Provide shared memory facilities off comm world | ||||
|   /////////////////////////////////////////////////// | ||||
|   static void SharedMemoryAllocate(uint64_t bytes, int flags); | ||||
|   static void SharedMemoryFree(void); | ||||
|   static void SharedMemoryCopy(void *dest,const void *src,size_t bytes); | ||||
|   static void SharedMemoryZero(void *dest,size_t bytes); | ||||
|  | ||||
| }; | ||||
|  | ||||
| @@ -118,14 +112,14 @@ class GlobalSharedMemory { | ||||
| ////////////////////////////// | ||||
| class SharedMemory  | ||||
| { | ||||
|  private: | ||||
| private: | ||||
|   static const int     MAXLOG2RANKSPERNODE = 16;             | ||||
|  | ||||
|   size_t heap_top; | ||||
|   size_t heap_bytes; | ||||
|   size_t heap_size; | ||||
|  | ||||
|  protected: | ||||
| protected: | ||||
|  | ||||
|   Grid_MPI_Comm    ShmComm; // for barriers | ||||
|   int    ShmRank;  | ||||
| @@ -133,7 +127,7 @@ class SharedMemory | ||||
|   std::vector<void *> ShmCommBufs; | ||||
|   std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks | ||||
|  | ||||
|  public: | ||||
| public: | ||||
|   SharedMemory() {}; | ||||
|   ~SharedMemory(); | ||||
|   /////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -150,6 +144,7 @@ class SharedMemory | ||||
|   // Call on any instance | ||||
|   /////////////////////////////////////////////////// | ||||
|   void SharedMemoryTest(void); | ||||
|    | ||||
|   void *ShmBufferSelf(void); | ||||
|   void *ShmBuffer    (int rank); | ||||
|   void *ShmBufferTranslate(int rank,void * local_p); | ||||
| @@ -164,4 +159,5 @@ class SharedMemory | ||||
|  | ||||
| }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -29,8 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/GridCore.h> | ||||
| #include <pwd.h> | ||||
|  | ||||
| namespace Grid {  | ||||
| #ifdef GRID_CUDA | ||||
| #include <cuda_runtime_api.h> | ||||
| #endif | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
| #define header "SharedMemoryMpi: " | ||||
| /*Construct from an MPI communicator*/ | ||||
| void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
| { | ||||
| @@ -46,6 +50,11 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
|   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); | ||||
|   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank); | ||||
|   MPI_Comm_size(WorldShmComm     ,&WorldShmSize); | ||||
|  | ||||
|   if ( WorldRank == 0) { | ||||
|     std::cout << header " World communicator of size " <<WorldSize << std::endl;   | ||||
|     std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl; | ||||
|   } | ||||
|   // WorldShmComm, WorldShmSize, WorldShmRank | ||||
|  | ||||
|   // WorldNodes | ||||
| @@ -130,7 +139,7 @@ int Log2Size(int TwoToPower,int MAXLOG2) | ||||
|   } | ||||
|   return log2size; | ||||
| } | ||||
| void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | ||||
| void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) | ||||
| { | ||||
|   ////////////////////////////////////////////////////////////////////////////// | ||||
|   // Look and see if it looks like an HPE 8600 based on hostname conventions | ||||
| @@ -143,10 +152,46 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors, | ||||
|   gethostname(name,namelen); | ||||
|   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; | ||||
|  | ||||
|   if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm); | ||||
|   else         OptimalCommunicatorSharedMemory(processors,optimal_comm); | ||||
|   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm); | ||||
|   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm); | ||||
| } | ||||
| void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | ||||
/////////////////////////////////////////////////////////////////////////
// Returns non-zero when a is an exact divisor of b (b is a whole multiple
// of a), zero otherwise.
//
// A zero divisor is handled explicitly: zero divides only zero. Without
// this guard the remainder below would be an integer division by zero.
/////////////////////////////////////////////////////////////////////////
static inline int divides(int a,int b)
{
  if ( a == 0 ) return ( b == 0 );
  return ( (b % a) == 0 );
}
| void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Powers of 2,3,5 only in prime decomposition for now | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int ndimension = WorldDims.size(); | ||||
|   ShmDims=Coordinate(ndimension,1); | ||||
|  | ||||
|   std::vector<int> primes({2,3,5}); | ||||
|  | ||||
|   int dim = 0; | ||||
|   int last_dim = ndimension - 1; | ||||
|   int AutoShmSize = 1; | ||||
|   while(AutoShmSize != WorldShmSize) { | ||||
|     int p; | ||||
|     for(p=0;p<primes.size();p++) { | ||||
|       int prime=primes[p]; | ||||
|       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) | ||||
|         && divides(prime,WorldShmSize/AutoShmSize)  ) { | ||||
| 	AutoShmSize*=prime; | ||||
| 	ShmDims[dim]*=prime; | ||||
| 	last_dim = dim; | ||||
| 	break; | ||||
|       } | ||||
|     } | ||||
|     if (p == primes.size() && last_dim == dim) { | ||||
|       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; | ||||
|       exit(EXIT_FAILURE); | ||||
|     } | ||||
|     dim=(dim+1) %ndimension; | ||||
|   } | ||||
| } | ||||
| void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Assert power of two shm_size. | ||||
| @@ -188,9 +233,9 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr | ||||
|   } | ||||
|  | ||||
|   std::string hname(name); | ||||
|   std::cout << "hostname "<<hname<<std::endl; | ||||
|   std::cout << "R " << R << " I " << I << " N "<< N | ||||
|             << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; | ||||
|   //  std::cout << "hostname "<<hname<<std::endl; | ||||
|   //  std::cout << "R " << R << " I " << I << " N "<< N | ||||
|   //            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////// | ||||
|   // broadcast node 0's base coordinate for this partition. | ||||
| @@ -212,16 +257,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr | ||||
|   // in a maximally symmetrical way | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int ndimension              = processors.size(); | ||||
|   std::vector<int> processor_coor(ndimension); | ||||
|   std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension); | ||||
|   std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension); | ||||
|   std::vector<int> HyperCoor(ndimension); | ||||
|   int dim = 0; | ||||
|   for(int l2=0;l2<log2size;l2++){ | ||||
|     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; | ||||
|     ShmDims[dim]*=2; | ||||
|     dim=(dim+1)%ndimension; | ||||
|   } | ||||
|   Coordinate processor_coor(ndimension); | ||||
|   Coordinate WorldDims = processors; | ||||
|   Coordinate ShmDims  (ndimension);  Coordinate NodeDims (ndimension); | ||||
|   Coordinate ShmCoor  (ndimension);    Coordinate NodeCoor (ndimension);    Coordinate WorldCoor(ndimension); | ||||
|   Coordinate HyperCoor(ndimension); | ||||
|  | ||||
|   GetShmDims(WorldDims,ShmDims); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Establish torus of processes and nodes with sub-blockings | ||||
| @@ -240,7 +282,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr | ||||
|      HyperCoor[d]=hcoor & msk;   | ||||
|      HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic | ||||
|      hcoor = hcoor >> bits; | ||||
|   }  | ||||
|   } | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Check processor counts match | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| @@ -269,29 +311,18 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr | ||||
|   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); | ||||
|   assert(ierr==0); | ||||
| } | ||||
| void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | ||||
| void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Assert power of two shm_size. | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); | ||||
|   assert(log2size != -1); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Identify subblock of ranks on node spreading across dims | ||||
|   // in a maximally symmetrical way | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int ndimension              = processors.size(); | ||||
|   std::vector<int> processor_coor(ndimension); | ||||
|   std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension); | ||||
|   std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension); | ||||
|   int dim = 0; | ||||
|   for(int l2=0;l2<log2size;l2++){ | ||||
|     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; | ||||
|     ShmDims[dim]*=2; | ||||
|     dim=(dim+1)%ndimension; | ||||
|   } | ||||
|   Coordinate processor_coor(ndimension); | ||||
|   Coordinate WorldDims = processors; Coordinate ShmDims(ndimension);  Coordinate NodeDims (ndimension); | ||||
|   Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension); | ||||
|  | ||||
|   GetShmDims(WorldDims,ShmDims); | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Establish torus of processes and nodes with sub-blockings | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| @@ -330,7 +361,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> | ||||
| #ifdef GRID_MPI3_SHMGET | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|  | ||||
| @@ -385,14 +416,101 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   _ShmAllocBytes  = bytes; | ||||
| } | ||||
| #endif | ||||
|   | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Hugetlbfs mapping intended | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #ifdef GRID_CUDA | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   void * ShmCommBuf ;  | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // allocate the pointer array for shared windows for our group | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   MPI_Barrier(WorldShmComm); | ||||
|   WorldShmCommBufs.resize(WorldShmSize); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity. | ||||
|   // The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades. | ||||
|   // e.g. DGX1, supermicro board,  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Each MPI rank should allocate our own buffer | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   auto err =  cudaMalloc(&ShmCommBuf, bytes); | ||||
|   if ( err !=  cudaSuccess) { | ||||
|     std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl; | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|   if (ShmCommBuf == (void *)NULL ) { | ||||
|     std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|   if ( WorldRank == 0 ){ | ||||
|     std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||
|   } | ||||
|   SharedMemoryZero(ShmCommBuf,bytes); | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Loop over ranks/gpu's on our node | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   for(int r=0;r<WorldShmSize;r++){ | ||||
|      | ||||
|     ////////////////////////////////////////////////// | ||||
|     // If it is me, pass around the IPC access key | ||||
|     ////////////////////////////////////////////////// | ||||
|     cudaIpcMemHandle_t handle; | ||||
|      | ||||
|     if ( r==WorldShmRank ) {  | ||||
|       err = cudaIpcGetMemHandle(&handle,ShmCommBuf); | ||||
|       if ( err !=  cudaSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
|     ////////////////////////////////////////////////// | ||||
|     // Share this IPC handle across the Shm Comm | ||||
|     ////////////////////////////////////////////////// | ||||
|     {  | ||||
|       int ierr=MPI_Bcast(&handle, | ||||
| 			 sizeof(handle), | ||||
| 			 MPI_BYTE, | ||||
| 			 r, | ||||
| 			 WorldShmComm); | ||||
|       assert(ierr==0); | ||||
|     } | ||||
|      | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     // If I am not the source, overwrite thisBuf with remote buffer | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     void * thisBuf = ShmCommBuf; | ||||
|     if ( r!=WorldShmRank ) {  | ||||
|       err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); | ||||
|       if ( err !=  cudaSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     // Save a copy of the device buffers | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     WorldShmCommBufs[r] = thisBuf; | ||||
|   } | ||||
|  | ||||
|   _ShmAllocBytes=bytes; | ||||
|   _ShmAlloc=1; | ||||
| } | ||||
| #else  | ||||
| #ifdef GRID_MPI3_SHMMMAP | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -429,7 +547,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     assert(((uint64_t)ptr&0x3F)==0); | ||||
|     close(fd); | ||||
|     WorldShmCommBufs[r] =ptr; | ||||
|     //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||
|     //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||
|   } | ||||
|   _ShmAlloc=1; | ||||
|   _ShmAllocBytes  = bytes; | ||||
| @@ -439,7 +557,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| #ifdef GRID_MPI3_SHM_NONE | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -486,7 +604,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| {  | ||||
|   std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0);  | ||||
|   MPI_Barrier(WorldShmComm); | ||||
| @@ -552,14 +670,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   _ShmAllocBytes = bytes; | ||||
| } | ||||
| #endif | ||||
| #endif // End NVCC case for GPU device buffers | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // Global shared functionality finished | ||||
|   // Now move to per communicator functionality | ||||
|   //////////////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////////////////////////// | ||||
| // Routines accessing shared memory should route through for GPU safety | ||||
| ///////////////////////////////////////////////////////////////////////// | ||||
| void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) | ||||
| { | ||||
| #ifdef GRID_CUDA | ||||
|   cudaMemset(dest,0,bytes); | ||||
| #else | ||||
|   bzero(dest,bytes); | ||||
| #endif | ||||
| } | ||||
| void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) | ||||
| { | ||||
| #ifdef GRID_CUDA | ||||
|   cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); | ||||
| #else    | ||||
|   bcopy(src,dest,bytes); | ||||
| #endif | ||||
| } | ||||
| //////////////////////////////////////////////////////// | ||||
| // Global shared functionality finished | ||||
| // Now move to per communicator functionality | ||||
| //////////////////////////////////////////////////////// | ||||
| void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | ||||
| { | ||||
|   int rank, size; | ||||
| @@ -587,7 +722,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | ||||
|     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); | ||||
|  | ||||
|     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; | ||||
|     //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl; | ||||
|   } | ||||
|   ShmBufferFreeAll(); | ||||
|  | ||||
| @@ -600,6 +734,26 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | ||||
|  | ||||
|   std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r; | ||||
|   MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);  | ||||
|  | ||||
| #ifdef GRID_IBM_SUMMIT | ||||
|   // Hide the shared memory path between sockets  | ||||
|   // if even number of nodes | ||||
|   if ( (ShmSize & 0x1)==0 ) { | ||||
|     int SocketSize = ShmSize/2; | ||||
|     int mySocket = ShmRank/SocketSize;  | ||||
|     for(int r=0;r<size;r++){ | ||||
|       int hisRank=ShmRanks[r]; | ||||
|       if ( hisRank!= MPI_UNDEFINED ) { | ||||
| 	int hisSocket=hisRank/SocketSize; | ||||
| 	if ( hisSocket != mySocket ) { | ||||
| 	  ShmRanks[r] = MPI_UNDEFINED; | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   SharedMemoryTest(); | ||||
| } | ||||
| ////////////////////////////////////////////////////////////////// | ||||
| // On node barrier | ||||
| @@ -614,24 +768,26 @@ void SharedMemory::ShmBarrier(void) | ||||
| void SharedMemory::SharedMemoryTest(void) | ||||
| { | ||||
|   ShmBarrier(); | ||||
|   uint64_t check[3]; | ||||
|   uint64_t magic = 0x5A5A5A; | ||||
|   if ( ShmRank == 0 ) { | ||||
|     for(int r=0;r<ShmSize;r++){ | ||||
|       uint64_t * check = (uint64_t *) ShmCommBufs[r]; | ||||
|       check[0] = GlobalSharedMemory::WorldNode; | ||||
|       check[1] = r; | ||||
|       check[2] = 0x5A5A5A; | ||||
|     for(uint64_t r=0;r<ShmSize;r++){ | ||||
|        check[0]=GlobalSharedMemory::WorldNode; | ||||
|        check[1]=r; | ||||
|        check[2]=magic; | ||||
|        GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t)); | ||||
|     } | ||||
|   } | ||||
|   ShmBarrier(); | ||||
|   for(int r=0;r<ShmSize;r++){ | ||||
|     uint64_t * check = (uint64_t *) ShmCommBufs[r]; | ||||
|      | ||||
|   for(uint64_t r=0;r<ShmSize;r++){ | ||||
|     ShmBarrier(); | ||||
|     GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t)); | ||||
|     ShmBarrier(); | ||||
|     assert(check[0]==GlobalSharedMemory::WorldNode); | ||||
|     assert(check[1]==r); | ||||
|     assert(check[2]==0x5A5A5A); | ||||
|      | ||||
|     assert(check[2]==magic); | ||||
|     ShmBarrier(); | ||||
|   } | ||||
|   ShmBarrier(); | ||||
| } | ||||
|  | ||||
| void *SharedMemory::ShmBuffer(int rank) | ||||
| @@ -645,7 +801,6 @@ void *SharedMemory::ShmBuffer(int rank) | ||||
| } | ||||
| void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) | ||||
| { | ||||
|   static int count =0; | ||||
|   int gpeer = ShmRanks[rank]; | ||||
|   assert(gpeer!=ShmRank); // never send to self | ||||
|   if (gpeer == MPI_UNDEFINED){ | ||||
| @@ -664,4 +819,5 @@ SharedMemory::~SharedMemory() | ||||
|   } | ||||
| }; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid);  | ||||
|  | ||||
|   | ||||
| @@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| namespace Grid {  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
|  | ||||
| /*Construct from an MPI communicator*/ | ||||
| void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
| @@ -47,7 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
|   _ShmSetup=1; | ||||
| } | ||||
|  | ||||
| void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | ||||
| void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) | ||||
| { | ||||
|   optimal_comm = WorldComm; | ||||
| } | ||||
| @@ -84,10 +84,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   _ShmAlloc=1; | ||||
| }; | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // Global shared functionality finished | ||||
|   // Now move to per communicator functionality | ||||
|   //////////////////////////////////////////////////////// | ||||
| //////////////////////////////////////////////////////// | ||||
| // Global shared functionality finished | ||||
| // Now move to per communicator functionality | ||||
| //////////////////////////////////////////////////////// | ||||
| void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | ||||
| { | ||||
|   assert(GlobalSharedMemory::ShmAlloc()==1); | ||||
| @@ -125,4 +125,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) | ||||
| SharedMemory::~SharedMemory() | ||||
| {}; | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid);  | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef _GRID_CSHIFT_H_ | ||||
| #define _GRID_CSHIFT_H_ | ||||
|  | ||||
| @@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #ifdef GRID_COMMS_SHMEM | ||||
| #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator | ||||
| #endif  | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| template<typename Op, typename T1>  | ||||
| auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift) | ||||
|     -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>  | ||||
| { | ||||
|   return Cshift(closure(expr),dim,shift); | ||||
| } | ||||
| template <class Op, class T1, class T2> | ||||
| auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift) | ||||
|   -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>  | ||||
| { | ||||
|   return Cshift(closure(expr),dim,shift); | ||||
| } | ||||
| template <class Op, class T1, class T2, class T3> | ||||
| auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift) | ||||
|   -> Lattice<decltype(expr.op.func(eval(0, expr.arg1), | ||||
| 				   eval(0, expr.arg2), | ||||
| 				   eval(0, expr.arg3)))>  | ||||
| { | ||||
|   return Cshift(closure(expr),dim,shift); | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -25,10 +25,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef _GRID_CSHIFT_COMMON_H_ | ||||
| #define _GRID_CSHIFT_COMMON_H_ | ||||
| #pragma once | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| extern Vector<std::pair<int,int> > Cshift_table;  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| // Gather for when there is no need to SIMD split  | ||||
| @@ -36,26 +37,27 @@ namespace Grid { | ||||
| template<class vobj> void  | ||||
| Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask = 0x3; | ||||
|   } | ||||
|    | ||||
|   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; | ||||
|   int e2=rhs.Grid()->_slice_block[dimension]; | ||||
|   int ent = 0; | ||||
|  | ||||
|   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||
|   if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest | ||||
|  | ||||
|   int stride=rhs.Grid()->_slice_stride[dimension]; | ||||
|  | ||||
|   int stride=rhs._grid->_slice_stride[dimension]; | ||||
|   if ( cbmask == 0x3 ) {  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o  = n*stride; | ||||
| 	int bo = n*e2; | ||||
| 	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); | ||||
| 	Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
| @@ -63,15 +65,20 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen | ||||
|      for(int n=0;n<e1;n++){ | ||||
|        for(int b=0;b<e2;b++){ | ||||
| 	 int o  = n*stride; | ||||
| 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||
| 	 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); | ||||
| 	 if ( ocb &cbmask ) { | ||||
| 	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b); | ||||
| 	   Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b); | ||||
| 	 } | ||||
|        } | ||||
|      } | ||||
|   } | ||||
|   parallel_for(int i=0;i<ent;i++){ | ||||
|     buffer[table[i].first]=rhs._odata[table[i].second]; | ||||
|   { | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|     auto buffer_p = & buffer[0]; | ||||
|     auto table = &Cshift_table[0]; | ||||
|     accelerator_for(i,ent,1,{ | ||||
|       buffer_p[table[i].first]=rhs_v[table[i].second]; | ||||
|     }); | ||||
|   } | ||||
| } | ||||
|  | ||||
| @@ -79,50 +86,54 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen | ||||
| // Gather for when there *is* need to SIMD split  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| template<class vobj> void  | ||||
| Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) | ||||
| Gather_plane_extract(const Lattice<vobj> &rhs, | ||||
| 		     ExtractPointerArray<typename vobj::scalar_object> pointers, | ||||
| 		     int dimension,int plane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask = 0x3; | ||||
|   } | ||||
|  | ||||
|   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|  | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int n1=rhs._grid->_slice_stride[dimension]; | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; | ||||
|   int e2=rhs.Grid()->_slice_block[dimension]; | ||||
|   int n1=rhs.Grid()->_slice_stride[dimension]; | ||||
|  | ||||
|   if ( cbmask ==0x3){ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|  | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|     accelerator_for2d(n,e1,b,e2,1,{ | ||||
| 	int o      =   n*n1; | ||||
| 	int offset = b+n*e2; | ||||
| 	 | ||||
| 	vobj temp =rhs._odata[so+o+b]; | ||||
| 	vobj temp =rhs_v[so+o+b]; | ||||
| 	extract<vobj>(temp,pointers,offset); | ||||
|  | ||||
|       } | ||||
|     } | ||||
|       }); | ||||
|   } else {  | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|  | ||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  | ||||
|     // Test_cshift_red_black code. | ||||
|     std::cout << " Dense packed buffer WARNING " <<std::endl; | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|     Coordinate rdim=rhs.Grid()->_rdimensions; | ||||
|     Coordinate cdm =rhs.Grid()->_checker_dim_mask; | ||||
|     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? | ||||
|     accelerator_for2d(n,e1,b,e2,1,{ | ||||
|  | ||||
| 	Coordinate coor; | ||||
|  | ||||
| 	int o=n*n1; | ||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||
| 	int oindex = o+b; | ||||
|  | ||||
|        	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); | ||||
|  | ||||
| 	int ocb=1<<cb; | ||||
| 	int offset = b+n*e2; | ||||
|  | ||||
| 	if ( ocb & cbmask ) { | ||||
| 	  vobj temp =rhs._odata[so+o+b]; | ||||
| 	  vobj temp =rhs_v[so+o+b]; | ||||
| 	  extract<vobj>(temp,pointers,offset); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|       }); | ||||
|   } | ||||
| } | ||||
|  | ||||
| @@ -131,28 +142,29 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_ | ||||
| ////////////////////////////////////////////////////// | ||||
| template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask=0x3; | ||||
|   } | ||||
|  | ||||
|   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|      | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int stride=rhs._grid->_slice_stride[dimension]; | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; | ||||
|   int e2=rhs.Grid()->_slice_block[dimension]; | ||||
|   int stride=rhs.Grid()->_slice_stride[dimension]; | ||||
|  | ||||
|   if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest | ||||
|  | ||||
|   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||
|   int ent    =0; | ||||
|  | ||||
|   if ( cbmask ==0x3 ) { | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||
| 	table[ent++] = std::pair<int,int>(so+o+b,bo+b); | ||||
| 	int o   =n*rhs.Grid()->_slice_stride[dimension]; | ||||
| 	int bo  =n*rhs.Grid()->_slice_block[dimension]; | ||||
| 	Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b); | ||||
|       } | ||||
|     } | ||||
|  | ||||
| @@ -160,57 +172,62 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | ||||
|     int bo=0; | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||
| 	int o   =n*rhs.Grid()->_slice_stride[dimension]; | ||||
| 	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||
| 	if ( ocb & cbmask ) { | ||||
| 	  table[ent++]=std::pair<int,int> (so+o+b,bo++); | ||||
| 	  Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   parallel_for(int i=0;i<ent;i++){ | ||||
|     rhs._odata[table[i].first]=buffer[table[i].second]; | ||||
|    | ||||
|   { | ||||
|     autoView( rhs_v, rhs, AcceleratorWrite); | ||||
|     auto buffer_p = & buffer[0]; | ||||
|     auto table = &Cshift_table[0]; | ||||
|     accelerator_for(i,ent,1,{ | ||||
| 	rhs_v[table[i].first]=buffer_p[table[i].second]; | ||||
|     }); | ||||
|   } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Scatter for when there *is* need to SIMD split | ||||
| ////////////////////////////////////////////////////// | ||||
| template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) | ||||
| template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask=0x3; | ||||
|   } | ||||
|  | ||||
|   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|      | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; | ||||
|   int e2=rhs.Grid()->_slice_block[dimension]; | ||||
|  | ||||
|   if(cbmask ==0x3 ) { | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||
| 	merge(rhs._odata[so+o+b],pointers,offset); | ||||
|       } | ||||
|     } | ||||
|     autoView( rhs_v , rhs, AcceleratorWrite); | ||||
|     accelerator_for2d(n,e1,b,e2,1,{ | ||||
| 	int o      = n*rhs.Grid()->_slice_stride[dimension]; | ||||
| 	int offset = b+n*rhs.Grid()->_slice_block[dimension]; | ||||
| 	merge(rhs_v[so+o+b],pointers,offset); | ||||
|       }); | ||||
|   } else {  | ||||
|  | ||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  | ||||
|     // Test_cshift_red_black code. | ||||
|     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME | ||||
|     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; | ||||
|     autoView( rhs_v, rhs, CpuWrite); | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||
| 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||
| 	int o      = n*rhs.Grid()->_slice_stride[dimension]; | ||||
| 	int offset = b+n*rhs.Grid()->_slice_block[dimension]; | ||||
| 	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); | ||||
| 	if ( ocb&cbmask ) { | ||||
| 	  merge(rhs._odata[so+o+b],pointers,offset); | ||||
| 	  merge(rhs_v[so+o+b],pointers,offset); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
| @@ -220,85 +237,96 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ | ||||
| ////////////////////////////////////////////////////// | ||||
| // local to node block strided copies | ||||
| ////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask=0x3; | ||||
|   } | ||||
|  | ||||
|   int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int ro  = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|   int lo  = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|  | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc | ||||
|   int e2=rhs.Grid()->_slice_block[dimension]; | ||||
|   int stride = rhs.Grid()->_slice_stride[dimension]; | ||||
|  | ||||
|   if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest | ||||
|  | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc | ||||
|   int e2=rhs._grid->_slice_block[dimension]; | ||||
|   int stride = rhs._grid->_slice_stride[dimension]; | ||||
|   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||
|   int ent=0; | ||||
|  | ||||
|   if(cbmask == 0x3 ){ | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|         int o =n*stride+b; | ||||
| 	table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||
| 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||
|       } | ||||
|     } | ||||
|   } else {  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|         int o =n*stride+b; | ||||
|         int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); | ||||
|         int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); | ||||
|         if ( ocb&cbmask ) { | ||||
| 	  table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||
| 	  Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   parallel_for(int i=0;i<ent;i++){ | ||||
|     lhs._odata[table[i].first]=rhs._odata[table[i].second]; | ||||
|   { | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|     autoView(lhs_v , lhs, AcceleratorWrite); | ||||
|     auto table = &Cshift_table[0]; | ||||
|     accelerator_for(i,ent,1,{ | ||||
|       lhs_v[table[i].first]=rhs_v[table[i].second]; | ||||
|     }); | ||||
|   } | ||||
|  | ||||
| } | ||||
|  | ||||
| template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) | ||||
| { | ||||
|   | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|   if ( !rhs.Grid()->CheckerBoarded(dimension) ) { | ||||
|     cbmask=0x3; | ||||
|   } | ||||
|  | ||||
|   int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
|   int ro  = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|   int lo  = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||
|  | ||||
|   int e1=rhs._grid->_slice_nblock[dimension]; | ||||
|   int e2=rhs._grid->_slice_block [dimension]; | ||||
|   int stride = rhs._grid->_slice_stride[dimension]; | ||||
|   int e1=rhs.Grid()->_slice_nblock[dimension]; | ||||
|   int e2=rhs.Grid()->_slice_block [dimension]; | ||||
|   int stride = rhs.Grid()->_slice_stride[dimension]; | ||||
|  | ||||
|   if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest | ||||
|  | ||||
|   static std::vector<std::pair<int,int> > table;  table.resize(e1*e2); | ||||
|   int ent=0; | ||||
|  | ||||
|   double t_tab,t_perm; | ||||
|   if ( cbmask == 0x3 ) { | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     for(int b=0;b<e2;b++){ | ||||
|       int o  =n*stride; | ||||
|       table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||
|       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||
|     }} | ||||
|   } else { | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     for(int b=0;b<e2;b++){ | ||||
|       int o  =n*stride; | ||||
|       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b); | ||||
|       if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||
|       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); | ||||
|       if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||
|     }} | ||||
|   } | ||||
|  | ||||
|   parallel_for(int i=0;i<ent;i++){ | ||||
|     permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type); | ||||
|   { | ||||
|     autoView( rhs_v, rhs, AcceleratorRead); | ||||
|     autoView( lhs_v, lhs, AcceleratorWrite); | ||||
|     auto table = &Cshift_table[0]; | ||||
|     accelerator_for(i,ent,1,{ | ||||
|       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); | ||||
|     }); | ||||
|   } | ||||
| } | ||||
|  | ||||
| @@ -309,11 +337,9 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r | ||||
| { | ||||
|   int sshift[2]; | ||||
|  | ||||
|   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); | ||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||
|   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); | ||||
|   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); | ||||
|  | ||||
|   double t_local; | ||||
|    | ||||
|   if ( sshift[0] == sshift[1] ) { | ||||
|     Cshift_local(ret,rhs,dimension,shift,0x3); | ||||
|   } else { | ||||
| @@ -324,7 +350,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r | ||||
|  | ||||
| template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||
| { | ||||
|   GridBase *grid = rhs._grid; | ||||
|   GridBase *grid = rhs.Grid(); | ||||
|   int fd = grid->_fdimensions[dimension]; | ||||
|   int rd = grid->_rdimensions[dimension]; | ||||
|   int ld = grid->_ldimensions[dimension]; | ||||
| @@ -335,18 +361,18 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|   shift = (shift+fd)%fd; | ||||
|  | ||||
|   // the permute type | ||||
|   ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||
|   ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); | ||||
|   int permute_dim =grid->PermuteDim(dimension); | ||||
|   int permute_type=grid->PermuteType(dimension); | ||||
|   int permute_type_dist; | ||||
|  | ||||
|   for(int x=0;x<rd;x++){        | ||||
|  | ||||
|     int o   = 0; | ||||
|     //    int o   = 0; | ||||
|     int bo  = x * grid->_ostride[dimension]; | ||||
|     int cb= (cbmask==0x2)? Odd : Even; | ||||
|  | ||||
|     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||
|     int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||
|     int sx     = (x+sshift)%rd; | ||||
|      | ||||
|     // wrap is whether sshift > rd. | ||||
| @@ -387,5 +413,5 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|    | ||||
|   } | ||||
| } | ||||
| } | ||||
| #endif | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,33 +24,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef _GRID_CSHIFT_MPI_H_ | ||||
| #define _GRID_CSHIFT_MPI_H_ | ||||
|  | ||||
|  | ||||
| namespace Grid {  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
|  | ||||
| template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) | ||||
| { | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|  | ||||
|   Lattice<vobj> ret(rhs._grid);  | ||||
|   Lattice<vobj> ret(rhs.Grid());  | ||||
|    | ||||
|   int fd = rhs._grid->_fdimensions[dimension]; | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|   int fd = rhs.Grid()->_fdimensions[dimension]; | ||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||
|  | ||||
|   // Map to always positive shift modulo global full dimension. | ||||
|   shift = (shift+fd)%fd; | ||||
|  | ||||
|   ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||
|   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); | ||||
|          | ||||
|   // the permute type | ||||
|   int simd_layout     = rhs._grid->_simd_layout[dimension]; | ||||
|   int comm_dim        = rhs._grid->_processors[dimension] >1 ; | ||||
|   int splice_dim      = rhs._grid->_simd_layout[dimension]>1 && (comm_dim); | ||||
|   int simd_layout     = rhs.Grid()->_simd_layout[dimension]; | ||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; | ||||
|   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); | ||||
|  | ||||
|  | ||||
|   if ( !comm_dim ) { | ||||
| @@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r | ||||
| { | ||||
|   int sshift[2]; | ||||
|  | ||||
|   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); | ||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||
|   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); | ||||
|   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); | ||||
|  | ||||
|   //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||
|   //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||
|   if ( sshift[0] == sshift[1] ) { | ||||
|     //    std::cout << "Single pass Cshift_comms" <<std::endl; | ||||
|     Cshift_comms(ret,rhs,dimension,shift,0x3); | ||||
| @@ -88,8 +88,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob | ||||
| { | ||||
|   int sshift[2]; | ||||
|  | ||||
|   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); | ||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||
|   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); | ||||
|   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); | ||||
|  | ||||
|   //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||
|   if ( sshift[0] == sshift[1] ) { | ||||
| @@ -107,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|  | ||||
|   GridBase *grid=rhs._grid; | ||||
|   Lattice<vobj> temp(rhs._grid); | ||||
|   GridBase *grid=rhs.Grid(); | ||||
|   Lattice<vobj> temp(rhs.Grid()); | ||||
|  | ||||
|   int fd              = rhs._grid->_fdimensions[dimension]; | ||||
|   int rd              = rhs._grid->_rdimensions[dimension]; | ||||
|   int pd              = rhs._grid->_processors[dimension]; | ||||
|   int simd_layout     = rhs._grid->_simd_layout[dimension]; | ||||
|   int comm_dim        = rhs._grid->_processors[dimension] >1 ; | ||||
|   int fd              = rhs.Grid()->_fdimensions[dimension]; | ||||
|   int rd              = rhs.Grid()->_rdimensions[dimension]; | ||||
|   int pd              = rhs.Grid()->_processors[dimension]; | ||||
|   int simd_layout     = rhs.Grid()->_simd_layout[dimension]; | ||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; | ||||
|   assert(simd_layout==1); | ||||
|   assert(comm_dim==1); | ||||
|   assert(shift>=0); | ||||
|   assert(shift<fd); | ||||
|    | ||||
|   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; | ||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; | ||||
|   commVector<vobj> send_buf(buffer_size); | ||||
|   commVector<vobj> recv_buf(buffer_size); | ||||
|  | ||||
|   int cb= (cbmask==0x2)? Odd : Even; | ||||
|   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||
|  | ||||
|   for(int x=0;x<rd;x++){        | ||||
|  | ||||
| @@ -145,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|  | ||||
|       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); | ||||
|  | ||||
|       int rank           = grid->_processor; | ||||
|       //      int rank           = grid->_processor; | ||||
|       int recv_from_rank; | ||||
|       int xmit_to_rank; | ||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||
| @@ -165,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|  | ||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||
| { | ||||
|   GridBase *grid=rhs._grid; | ||||
|   GridBase *grid=rhs.Grid(); | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_object scalar_object; | ||||
| @@ -193,21 +193,21 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|   // Simd direction uses an extract/merge pair | ||||
|   /////////////////////////////////////////////// | ||||
|   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; | ||||
|   int words = sizeof(vobj)/sizeof(vector_type); | ||||
|   //  int words = sizeof(vobj)/sizeof(vector_type); | ||||
|  | ||||
|   std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||
|   std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||
|  | ||||
|   int bytes = buffer_size*sizeof(scalar_object); | ||||
|  | ||||
|   std::vector<scalar_object *>  pointers(Nsimd); //  | ||||
|   std::vector<scalar_object *> rpointers(Nsimd); // received pointers | ||||
|   ExtractPointerArray<scalar_object>  pointers(Nsimd); //  | ||||
|   ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers | ||||
|  | ||||
|   /////////////////////////////////////////// | ||||
|   // Work out what to send where | ||||
|   /////////////////////////////////////////// | ||||
|   int cb    = (cbmask==0x2)? Odd : Even; | ||||
|   int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||
|   int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||
|  | ||||
|   // loop over outer coord planes orthog to dim | ||||
|   for(int x=0;x<rd;x++){        | ||||
| @@ -257,6 +257,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||
|   } | ||||
|  | ||||
|  } | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid);  | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,17 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef _GRID_CSHIFT_NONE_H_ | ||||
| #define _GRID_CSHIFT_NONE_H_ | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) | ||||
| { | ||||
|   Lattice<vobj> ret(rhs._grid); | ||||
|   ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||
|   Lattice<vobj> ret(rhs.Grid()); | ||||
|   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); | ||||
|   Cshift_local(ret,rhs,dimension,shift); | ||||
|   return ret; | ||||
| } | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										4
									
								
								Grid/cshift/Cshift_table.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								Grid/cshift/Cshift_table.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | ||||
| #include <Grid/GridCore.h>        | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| Vector<std::pair<int,int> > Cshift_table;  | ||||
| NAMESPACE_END(Grid); | ||||
| @@ -1,3 +1,4 @@ | ||||
| #ifndef __NVCC__ | ||||
| /* | ||||
|     __ _____ _____ _____ | ||||
|  __|  |   __|     |   | |  JSON for Modern C++ | ||||
| @@ -18918,3 +18919,4 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std | ||||
|  | ||||
|  | ||||
| #endif | ||||
| #endif | ||||
|   | ||||
| @@ -25,9 +25,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_H | ||||
| #define GRID_LATTICE_H | ||||
|  | ||||
| #pragma once | ||||
| #include <Grid/lattice/Lattice_view.h> | ||||
| #include <Grid/lattice/Lattice_base.h> | ||||
|  | ||||
| #endif | ||||
| #include <Grid/lattice/Lattice_conformable.h> | ||||
| #include <Grid/lattice/Lattice_ET.h> | ||||
| #include <Grid/lattice/Lattice_arith.h> | ||||
| #include <Grid/lattice/Lattice_trace.h> | ||||
| #include <Grid/lattice/Lattice_transpose.h> | ||||
| #include <Grid/lattice/Lattice_local.h> | ||||
| #include <Grid/lattice/Lattice_reduction.h> | ||||
| #include <Grid/lattice/Lattice_peekpoke.h> | ||||
| //#include <Grid/lattice/Lattice_reality.h> | ||||
| #include <Grid/lattice/Lattice_comparison_utils.h> | ||||
| #include <Grid/lattice/Lattice_comparison.h> | ||||
| #include <Grid/lattice/Lattice_coordinate.h> | ||||
| //#include <Grid/lattice/Lattice_where.h> | ||||
| #include <Grid/lattice/Lattice_rng.h> | ||||
| #include <Grid/lattice/Lattice_unary.h> | ||||
| #include <Grid/lattice/Lattice_transfer.h> | ||||
| #include <Grid/lattice/Lattice_basis.h> | ||||
|   | ||||
| @@ -9,6 +9,7 @@ Copyright (C) 2015 | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: neo <cossu@post.kek.jp> | ||||
| Author: Christoph Lehner <christoph@lhnr.de> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| @@ -27,7 +28,7 @@ with this program; if not, write to the Free Software Foundation, Inc., | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| 			   /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_ET_H | ||||
| #define GRID_LATTICE_ET_H | ||||
|  | ||||
| @@ -36,13 +37,13 @@ directory | ||||
| #include <typeinfo> | ||||
| #include <vector> | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| //////////////////////////////////////////////////// | ||||
| // Predicated where support | ||||
| //////////////////////////////////////////////////// | ||||
| template <class iobj, class vobj, class robj> | ||||
| inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, | ||||
| accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, | ||||
|                             const robj &iffalse) { | ||||
|   typename std::remove_const<vobj>::type ret; | ||||
|  | ||||
| @@ -51,11 +52,10 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   const int Nsimd = vobj::vector_type::Nsimd(); | ||||
|   const int words = sizeof(vobj) / sizeof(vector_type); | ||||
|  | ||||
|   std::vector<Integer> mask(Nsimd); | ||||
|   std::vector<scalar_object> truevals(Nsimd); | ||||
|   std::vector<scalar_object> falsevals(Nsimd); | ||||
|   ExtractBuffer<Integer> mask(Nsimd); | ||||
|   ExtractBuffer<scalar_object> truevals(Nsimd); | ||||
|   ExtractBuffer<scalar_object> falsevals(Nsimd); | ||||
|  | ||||
|   extract(iftrue, truevals); | ||||
|   extract(iffalse, falsevals); | ||||
| @@ -69,158 +69,212 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| //////////////////////////////////////////// | ||||
| // recursive evaluation of expressions; Could | ||||
| // switch to generic approach with variadics, a la | ||||
| // Antonin's Lat Sim but the repack to variadic with popped | ||||
| // from tuple is hideous; C++14 introduces std::make_index_sequence for this | ||||
| //////////////////////////////////////////// | ||||
|  | ||||
| // leaf eval of lattice ; should enable if protect using traits | ||||
|  | ||||
| template <typename T> | ||||
| using is_lattice = std::is_base_of<LatticeBase, T>; | ||||
|  | ||||
| template <typename T> | ||||
| using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>; | ||||
|  | ||||
| template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >; | ||||
|  | ||||
| ///////////////////////////////////////////////////// | ||||
| //Specialization of getVectorType for lattices | ||||
| ///////////////////////////////////////////////////// | ||||
| template<typename T> | ||||
| struct getVectorType<Lattice<T> >{ | ||||
|   typedef typename Lattice<T>::vector_object type; | ||||
| }; | ||||
|   | ||||
| template<class sobj> | ||||
| inline sobj eval(const unsigned int ss, const sobj &arg) | ||||
|  | ||||
| //////////////////////////////////////////// | ||||
| //--  recursive evaluation of expressions; -- | ||||
| // handle leaves of syntax tree | ||||
| /////////////////////////////////////////////////// | ||||
| template<class sobj> accelerator_inline  | ||||
| sobj eval(const uint64_t ss, const sobj &arg) | ||||
| { | ||||
|   return arg; | ||||
| } | ||||
| template <class lobj> | ||||
| inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) { | ||||
|   return arg._odata[ss]; | ||||
|  | ||||
| template <class lobj> accelerator_inline  | ||||
| const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)  | ||||
| { | ||||
|   return arg[ss]; | ||||
| } | ||||
|  | ||||
| // handle nodes in syntax tree | ||||
| template <typename Op, typename T1> | ||||
| auto inline eval( | ||||
|     const unsigned int ss, | ||||
|     const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand | ||||
|     -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) { | ||||
|   return expr.first.func(eval(ss, std::get<0>(expr.second))); | ||||
| // What needs this? | ||||
| // Cannot be legal on accelerator | ||||
| // Comparison must convert | ||||
| #if 1 | ||||
| template <class lobj> accelerator_inline  | ||||
| const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)  | ||||
| { | ||||
|   auto view = arg.View(AcceleratorRead); | ||||
|   return view[ss]; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template <typename Op, typename T1, typename T2> | ||||
| auto inline eval( | ||||
|     const unsigned int ss, | ||||
|     const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands | ||||
|     -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), | ||||
|                                 eval(ss, std::get<1>(expr.second)))) { | ||||
|   return expr.first.func(eval(ss, std::get<0>(expr.second)), | ||||
|                          eval(ss, std::get<1>(expr.second))); | ||||
| /////////////////////////////////////////////////// | ||||
| // handle nodes in syntax tree- eval one operand | ||||
| /////////////////////////////////////////////////// | ||||
| template <typename Op, typename T1> accelerator_inline  | ||||
| auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)   | ||||
|   -> decltype(expr.op.func( eval(ss, expr.arg1))) | ||||
| { | ||||
|   return expr.op.func( eval(ss, expr.arg1) ); | ||||
| } | ||||
|  | ||||
| template <typename Op, typename T1, typename T2, typename T3> | ||||
| auto inline eval(const unsigned int ss, | ||||
|                  const LatticeTrinaryExpression<Op, T1, T2, T3> | ||||
|                      &expr)  // eval three operands | ||||
|     -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), | ||||
|                                 eval(ss, std::get<1>(expr.second)), | ||||
|                                 eval(ss, std::get<2>(expr.second)))) { | ||||
|   return expr.first.func(eval(ss, std::get<0>(expr.second)), | ||||
|                          eval(ss, std::get<1>(expr.second)), | ||||
|                          eval(ss, std::get<2>(expr.second))); | ||||
| /////////////////////// | ||||
| // eval two operands | ||||
| /////////////////////// | ||||
| template <typename Op, typename T1, typename T2> accelerator_inline | ||||
| auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)   | ||||
|   -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) | ||||
| { | ||||
|   return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); | ||||
| } | ||||
| /////////////////////// | ||||
| // eval three operands | ||||
| /////////////////////// | ||||
| // Trinary node of the expression tree: evaluate the three operands at index | ||||
| // ss and combine them with the node's op.func. | ||||
| template <typename Op, typename T1, typename T2, typename T3> | ||||
| accelerator_inline auto eval(const uint64_t ss, | ||||
|                              const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) | ||||
|     -> decltype(expr.op.func(eval(ss, expr.arg1), | ||||
|                              eval(ss, expr.arg2), | ||||
|                              eval(ss, expr.arg3))) | ||||
| { | ||||
|   return expr.op.func(eval(ss, expr.arg1), | ||||
|                       eval(ss, expr.arg2), | ||||
|                       eval(ss, expr.arg3)); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Obtain the grid from an expression, ensuring conformable. This must follow a | ||||
| // tree recursion | ||||
| // tree recursion; must retain grid pointer in the LatticeView class which sucks | ||||
| // Use a different method, and make it void *. | ||||
| // Perhaps a conformable method. | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template <class T1, | ||||
|           typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf | ||||
| template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf | ||||
| { | ||||
|   if (grid) { | ||||
|     conformable(grid, lat._grid); | ||||
|   } | ||||
|   grid = lat._grid; | ||||
|   lat.Conformable(grid); | ||||
| } | ||||
| template <class T1, | ||||
|           typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void GridFromExpression(GridBase *&grid, | ||||
|                                const T1 &notlat)  // non-lattice leaf | ||||
|  | ||||
| // Non-lattice leaf (enabled when is_lattice<T1>::value is false): the body is | ||||
| // empty, so the argument is ignored and grid is left unchanged. | ||||
| template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| accelerator_inline  | ||||
| void GridFromExpression(GridBase *&grid,const T1 &notlat)  // non-lattice leaf | ||||
| {} | ||||
|  | ||||
| template <typename Op, typename T1> | ||||
| inline void GridFromExpression(GridBase *&grid, | ||||
|                                const LatticeUnaryExpression<Op, T1> &expr) { | ||||
|   GridFromExpression(grid, std::get<0>(expr.second));  // recurse | ||||
| accelerator_inline  | ||||
| void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr)  | ||||
| { | ||||
|   GridFromExpression(grid, expr.arg1);  // recurse | ||||
| } | ||||
|  | ||||
| template <typename Op, typename T1, typename T2> | ||||
| inline void GridFromExpression( | ||||
|     GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) { | ||||
|   GridFromExpression(grid, std::get<0>(expr.second));  // recurse | ||||
|   GridFromExpression(grid, std::get<1>(expr.second)); | ||||
| accelerator_inline  | ||||
| void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr)  | ||||
| { | ||||
|   GridFromExpression(grid, expr.arg1);  // recurse | ||||
|   GridFromExpression(grid, expr.arg2); | ||||
| } | ||||
| template <typename Op, typename T1, typename T2, typename T3> | ||||
| inline void GridFromExpression( | ||||
|     GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { | ||||
|   GridFromExpression(grid, std::get<0>(expr.second));  // recurse | ||||
|   GridFromExpression(grid, std::get<1>(expr.second)); | ||||
|   GridFromExpression(grid, std::get<2>(expr.second)); | ||||
| accelerator_inline  | ||||
| void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  | ||||
| { | ||||
|   GridFromExpression(grid, expr.arg1);  // recurse | ||||
|   GridFromExpression(grid, expr.arg2);  // recurse | ||||
|   GridFromExpression(grid, expr.arg3);  // recurse | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Obtain the CB from an expression, ensuring conformable. This must follow a | ||||
| // tree recursion | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template <class T1, | ||||
|           typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf | ||||
| { | ||||
|   if ((cb == Odd) || (cb == Even)) { | ||||
|     assert(cb == lat.checkerboard); | ||||
|     assert(cb == lat.Checkerboard()); | ||||
|   } | ||||
|   cb = lat.checkerboard; | ||||
|   //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl; | ||||
|   cb = lat.Checkerboard(); | ||||
| } | ||||
| template <class T1, | ||||
|           typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf | ||||
| // Non-lattice leaf (enabled when is_lattice<T1>::value is false): the body is | ||||
| // empty, so the argument is ignored and cb is left unchanged. | ||||
| template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf | ||||
| // Unary expression node: forward the checkerboard query to the single operand. | ||||
| template <typename Op, typename T1> | ||||
| inline void CBFromExpression(int &cb, const LatticeUnaryExpression<Op, T1> &expr) | ||||
| { | ||||
|   CBFromExpression(cb, expr.arg1); | ||||
| } | ||||
| template <typename Op, typename T1> | ||||
| inline void CBFromExpression(int &cb, | ||||
|                              const LatticeUnaryExpression<Op, T1> &expr) { | ||||
|   CBFromExpression(cb, std::get<0>(expr.second));  // recurse | ||||
|   //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl; | ||||
| } | ||||
|  | ||||
| template <typename Op, typename T1, typename T2> | ||||
| inline void CBFromExpression(int &cb, | ||||
|                              const LatticeBinaryExpression<Op, T1, T2> &expr) { | ||||
|   CBFromExpression(cb, std::get<0>(expr.second));  // recurse | ||||
|   CBFromExpression(cb, std::get<1>(expr.second)); | ||||
|   //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl; | ||||
| // Binary expression node: visit the first operand, then the second, with the | ||||
| // same cb reference. | ||||
| template <typename Op, typename T1, typename T2> | ||||
| inline void CBFromExpression(int &cb, const LatticeBinaryExpression<Op, T1, T2> &expr) | ||||
| { | ||||
|   CBFromExpression(cb, expr.arg1); | ||||
|   CBFromExpression(cb, expr.arg2); | ||||
| } | ||||
| template <typename Op, typename T1, typename T2, typename T3> | ||||
| inline void CBFromExpression( | ||||
|     int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { | ||||
|   CBFromExpression(cb, std::get<0>(expr.second));  // recurse | ||||
|   CBFromExpression(cb, std::get<1>(expr.second)); | ||||
|   CBFromExpression(cb, std::get<2>(expr.second)); | ||||
|   //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl; | ||||
| inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  | ||||
| { | ||||
|   CBFromExpression(cb, expr.arg1);  // recurse AST | ||||
|   CBFromExpression(cb, expr.arg2);  // recurse AST | ||||
|   CBFromExpression(cb, expr.arg3);  // recurse AST | ||||
| } | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // ViewOpen | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Lattice leaf (enabled when is_lattice<T1>::value is true): calls | ||||
| // lat.ViewOpen with the AcceleratorRead mode. | ||||
| template <class T1, | ||||
|           typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void ExpressionViewOpen(T1 &lat) | ||||
| { | ||||
|   lat.ViewOpen(AcceleratorRead); | ||||
| } | ||||
| // Non-lattice leaf (enabled when is_lattice<T1>::value is false): empty body, | ||||
| // the argument is ignored. | ||||
| template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
|   inline void ExpressionViewOpen(T1 &notlat) {} | ||||
|  | ||||
| // Unary expression node: forward the view-open request to the single operand. | ||||
| template <typename Op, typename T1> | ||||
| inline void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr) | ||||
| { | ||||
|   ExpressionViewOpen(expr.arg1); | ||||
| } | ||||
|  | ||||
| // Binary expression node: forward the view-open request to the first | ||||
| // operand, then to the second. | ||||
| template <typename Op, typename T1, typename T2> | ||||
| inline void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) | ||||
| { | ||||
|   ExpressionViewOpen(expr.arg1); | ||||
|   ExpressionViewOpen(expr.arg2); | ||||
| } | ||||
| // Trinary expression node: forward the view-open request to arg1, arg2 and | ||||
| // arg3, in that order. | ||||
| template <typename Op, typename T1, typename T2, typename T3> inline | ||||
| void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) | ||||
| { | ||||
|   ExpressionViewOpen(expr.arg1); | ||||
|   ExpressionViewOpen(expr.arg2); | ||||
|   ExpressionViewOpen(expr.arg3); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // ViewClose | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Lattice leaf (enabled when is_lattice<T1>::value is true): calls | ||||
| // lat.ViewClose(). | ||||
| template <class T1, | ||||
|           typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void ExpressionViewClose(T1 &lat) | ||||
| { | ||||
|   lat.ViewClose(); | ||||
| } | ||||
| // Non-lattice leaf (enabled when is_lattice<T1>::value is false): empty body, | ||||
| // the argument is ignored. | ||||
| template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> | ||||
| inline void ExpressionViewClose(T1 &notlat) {} | ||||
|  | ||||
| // Unary expression node: forward the view-close request to the single operand. | ||||
| template <typename Op, typename T1> | ||||
| inline void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr) | ||||
| { | ||||
|   ExpressionViewClose(expr.arg1); | ||||
| } | ||||
| // Binary expression node: forward the view-close request to the first | ||||
| // operand, then to the second. | ||||
| template <typename Op, typename T1, typename T2> | ||||
| inline void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr) | ||||
| { | ||||
|   ExpressionViewClose(expr.arg1); | ||||
|   ExpressionViewClose(expr.arg2); | ||||
| } | ||||
| // Trinary expression node: forward the view-close request to arg1, arg2 and | ||||
| // arg3, in that order. | ||||
| template <typename Op, typename T1, typename T2, typename T3> inline | ||||
| void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) | ||||
| { | ||||
|   ExpressionViewClose(expr.arg1); | ||||
|   ExpressionViewClose(expr.arg2); | ||||
|   ExpressionViewClose(expr.arg3); | ||||
| } | ||||
|  | ||||
| //////////////////////////////////////////// | ||||
| // Unary operators and funcs | ||||
| //////////////////////////////////////////// | ||||
| #define GridUnopClass(name, ret)                                          \ | ||||
|   template <class arg>                                                    \ | ||||
|   struct name {                                                           \ | ||||
|     static auto inline func(const arg a) -> decltype(ret) { return ret; } \ | ||||
| #define GridUnopClass(name, ret)					\ | ||||
|   template <class arg>							\ | ||||
|   struct name {								\ | ||||
|     static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ | ||||
|   }; | ||||
|  | ||||
| GridUnopClass(UnarySub, -a); | ||||
| @@ -250,19 +304,21 @@ GridUnopClass(UnaryExp, exp(a)); | ||||
| //////////////////////////////////////////// | ||||
| // Binary operators | ||||
| //////////////////////////////////////////// | ||||
| #define GridBinOpClass(name, combination)                      \ | ||||
|   template <class left, class right>                           \ | ||||
|   struct name {                                                \ | ||||
|     static auto inline func(const left &lhs, const right &rhs) \ | ||||
|         -> decltype(combination) const {                       \ | ||||
|       return combination;                                      \ | ||||
|     }                                                          \ | ||||
|   } | ||||
| #define GridBinOpClass(name, combination)			\ | ||||
|   template <class left, class right>				\ | ||||
|   struct name {							\ | ||||
|     static auto accelerator_inline				\ | ||||
|     func(const left &lhs, const right &rhs)			\ | ||||
|       -> decltype(combination) const				\ | ||||
|     {								\ | ||||
|       return combination;					\ | ||||
|     }								\ | ||||
|   }; | ||||
|  | ||||
| GridBinOpClass(BinaryAdd, lhs + rhs); | ||||
| GridBinOpClass(BinarySub, lhs - rhs); | ||||
| GridBinOpClass(BinaryMul, lhs *rhs); | ||||
| GridBinOpClass(BinaryDiv, lhs /rhs); | ||||
|  | ||||
| GridBinOpClass(BinaryAnd, lhs &rhs); | ||||
| GridBinOpClass(BinaryOr, lhs | rhs); | ||||
| GridBinOpClass(BinaryAndAnd, lhs &&rhs); | ||||
| @@ -271,92 +327,71 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); | ||||
| //////////////////////////////////////////////////// | ||||
| // Trinary conditional op | ||||
| //////////////////////////////////////////////////// | ||||
| #define GridTrinOpClass(name, combination)                                     \ | ||||
|   template <class predicate, class left, class right>                          \ | ||||
|   struct name {                                                                \ | ||||
|     static auto inline func(const predicate &pred, const left &lhs,            \ | ||||
|                             const right &rhs) -> decltype(combination) const { \ | ||||
|       return combination;                                                      \ | ||||
|     }                                                                          \ | ||||
|   } | ||||
| #define GridTrinOpClass(name, combination)				\ | ||||
|   template <class predicate, class left, class right>			\ | ||||
|   struct name {								\ | ||||
|     static auto accelerator_inline					\ | ||||
|     func(const predicate &pred, const left &lhs, const right &rhs)	\ | ||||
|       -> decltype(combination) const					\ | ||||
|     {									\ | ||||
|       return combination;						\ | ||||
|     }									\ | ||||
|   }; | ||||
|  | ||||
| GridTrinOpClass( | ||||
|     TrinaryWhere, | ||||
|     (predicatedWhere<predicate, typename std::remove_reference<left>::type, | ||||
|                      typename std::remove_reference<right>::type>(pred, lhs, | ||||
|                                                                   rhs))); | ||||
| GridTrinOpClass(TrinaryWhere, | ||||
| 		(predicatedWhere<predicate,  | ||||
| 		 typename std::remove_reference<left>::type, | ||||
| 		 typename std::remove_reference<right>::type>(pred, lhs,rhs))); | ||||
|  | ||||
| //////////////////////////////////////////// | ||||
| // Operator syntactical glue | ||||
| //////////////////////////////////////////// | ||||
|  | ||||
| #define GRID_UNOP(name) name<decltype(eval(0, arg))> | ||||
| #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> | ||||
| #define GRID_TRINOP(name) \ | ||||
|   name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))> | ||||
| #define GRID_UNOP(name)   name<decltype(eval(0, arg))> | ||||
| #define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> | ||||
| #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))> | ||||
|  | ||||
| #define GRID_DEF_UNOP(op, name)                                             \ | ||||
|   template <typename T1,                                                    \ | ||||
|             typename std::enable_if<is_lattice<T1>::value ||                \ | ||||
|                                         is_lattice_expr<T1>::value,         \ | ||||
|                                     T1>::type * = nullptr>                  \ | ||||
|   inline auto op(const T1 &arg)                                             \ | ||||
|       ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \ | ||||
|           std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \ | ||||
|     return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \ | ||||
|         std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \ | ||||
| #define GRID_DEF_UNOP(op, name)						\ | ||||
|   template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ | ||||
|   inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \ | ||||
|   {									\ | ||||
|     return     LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \ | ||||
|   } | ||||
|  | ||||
| #define GRID_BINOP_LEFT(op, name)                                             \ | ||||
|   template <typename T1, typename T2,                                         \ | ||||
|             typename std::enable_if<is_lattice<T1>::value ||                  \ | ||||
|                                         is_lattice_expr<T1>::value,           \ | ||||
|                                     T1>::type * = nullptr>                    \ | ||||
|   inline auto op(const T1 &lhs, const T2 &rhs)                                \ | ||||
|       ->decltype(                                                             \ | ||||
|           LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \ | ||||
|               std::make_pair(GRID_BINOP(name)(),                              \ | ||||
|                              std::forward_as_tuple(lhs, rhs)))) {             \ | ||||
|     return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \ | ||||
|         std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \ | ||||
| #define GRID_BINOP_LEFT(op, name)					\ | ||||
|   template <typename T1, typename T2,					\ | ||||
|             typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ | ||||
|   inline auto op(const T1 &lhs, const T2 &rhs)				\ | ||||
|     ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \ | ||||
|   {									\ | ||||
|     return     LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\ | ||||
|   } | ||||
|  | ||||
| #define GRID_BINOP_RIGHT(op, name)                                            \ | ||||
|   template <typename T1, typename T2,                                         \ | ||||
|             typename std::enable_if<!is_lattice<T1>::value &&                 \ | ||||
|                                         !is_lattice_expr<T1>::value,          \ | ||||
|                                     T1>::type * = nullptr,                    \ | ||||
|             typename std::enable_if<is_lattice<T2>::value ||                  \ | ||||
|                                         is_lattice_expr<T2>::value,           \ | ||||
|                                     T2>::type * = nullptr>                    \ | ||||
|   inline auto op(const T1 &lhs, const T2 &rhs)                                \ | ||||
|       ->decltype(                                                             \ | ||||
|           LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \ | ||||
|               std::make_pair(GRID_BINOP(name)(),                              \ | ||||
|                              std::forward_as_tuple(lhs, rhs)))) {             \ | ||||
|     return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \ | ||||
|         std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \ | ||||
| #define GRID_BINOP_RIGHT(op, name)					\ | ||||
|   template <typename T1, typename T2,					\ | ||||
|             typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \ | ||||
|             typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \ | ||||
|   inline auto op(const T1 &lhs, const T2 &rhs)				\ | ||||
|     ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \ | ||||
|   {									\ | ||||
|     return     LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \ | ||||
|   } | ||||
|  | ||||
| #define GRID_DEF_BINOP(op, name) \ | ||||
|   GRID_BINOP_LEFT(op, name);     \ | ||||
| #define GRID_DEF_BINOP(op, name)		\ | ||||
|   GRID_BINOP_LEFT(op, name);			\ | ||||
|   GRID_BINOP_RIGHT(op, name); | ||||
|  | ||||
| #define GRID_DEF_TRINOP(op, name)                                              \ | ||||
|   template <typename T1, typename T2, typename T3>                             \ | ||||
|   inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \ | ||||
|       ->decltype(                                                              \ | ||||
|           LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \ | ||||
|                                    const T3 &>(std::make_pair(                 \ | ||||
|               GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \ | ||||
|     return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \ | ||||
|                                     const T3 &>(std::make_pair(                \ | ||||
|         GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \ | ||||
| #define GRID_DEF_TRINOP(op, name)					\ | ||||
|   template <typename T1, typename T2, typename T3>			\ | ||||
|   inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)		\ | ||||
|     ->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \ | ||||
|   {									\ | ||||
|     return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \ | ||||
|   } | ||||
|  | ||||
| //////////////////////// | ||||
| // Operator definitions | ||||
| //////////////////////// | ||||
|  | ||||
| GRID_DEF_UNOP(operator-, UnarySub); | ||||
| GRID_DEF_UNOP(Not, UnaryNot); | ||||
| GRID_DEF_UNOP(operator!, UnaryNot); | ||||
| @@ -400,29 +435,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere); | ||||
| ///////////////////////////////////////////////////////////// | ||||
| template <class Op, class T1> | ||||
| auto closure(const LatticeUnaryExpression<Op, T1> &expr) | ||||
|     -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> { | ||||
|   Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret( | ||||
|       expr); | ||||
|   -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>  | ||||
| { | ||||
|   Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr); | ||||
|   return ret; | ||||
| } | ||||
| template <class Op, class T1, class T2> | ||||
| auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) | ||||
|     -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), | ||||
|                                         eval(0, std::get<1>(expr.second))))> { | ||||
|   Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), | ||||
|                                    eval(0, std::get<1>(expr.second))))> | ||||
|       ret(expr); | ||||
|   -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>  | ||||
| { | ||||
|   Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr); | ||||
|   return ret; | ||||
| } | ||||
| template <class Op, class T1, class T2, class T3> | ||||
| auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) | ||||
|     -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), | ||||
|                                         eval(0, std::get<1>(expr.second)), | ||||
|                                         eval(0, std::get<2>(expr.second))))> { | ||||
|   Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), | ||||
|                                    eval(0, std::get<1>(expr.second)), | ||||
|                                    eval(0, std::get<2>(expr.second))))> | ||||
|       ret(expr); | ||||
|   -> Lattice<decltype(expr.op.func(eval(0, expr.arg1), | ||||
| 				   eval(0, expr.arg2), | ||||
| 				   eval(0, expr.arg3)))>  | ||||
| { | ||||
|   Lattice<decltype(expr.op.func(eval(0, expr.arg1), | ||||
| 				eval(0, expr.arg2), | ||||
| 				eval(0, expr.arg3)))>  ret(expr); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| @@ -433,34 +466,7 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) | ||||
| #undef GRID_DEF_UNOP | ||||
| #undef GRID_DEF_BINOP | ||||
| #undef GRID_DEF_TRINOP | ||||
| } | ||||
|  | ||||
| #if 0 | ||||
| using namespace Grid; | ||||
|          | ||||
|  int main(int argc,char **argv){ | ||||
|     | ||||
|    Lattice<double> v1(16); | ||||
|    Lattice<double> v2(16); | ||||
|    Lattice<double> v3(16); | ||||
|  | ||||
|    BinaryAdd<double,double> tmp; | ||||
|    LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>  | ||||
|      expr(std::make_pair(tmp, | ||||
|     std::forward_as_tuple(v1,v2))); | ||||
|    tmp.func(eval(0,v1),eval(0,v2)); | ||||
|  | ||||
|    auto var = v1+v2; | ||||
|    std::cout<<GridLogMessage<<typeid(var).name()<<std::endl; | ||||
|  | ||||
|    v3=v1+v2; | ||||
|    v3=v1+v2+v1*v2; | ||||
|  }; | ||||
|  | ||||
| void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3) | ||||
| { | ||||
|    v3=v1+v2+v1*v2; | ||||
| } | ||||
| #endif | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -7,6 +7,7 @@ | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Christoph Lehner <christoph@lhnr.de> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
| @@ -23,233 +24,235 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_ARITH_H | ||||
| #define GRID_LATTICE_ARITH_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   //  avoid copy back routines for mult, mac, sub, add | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Lattice x Lattice: applies the site-level mult to every outer site of lhs | ||||
|   // and rhs, writing into ret.  ret adopts lhs's checkerboard before the | ||||
|   // conformability checks run. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Compute into a local object, then stream it out to ret. | ||||
|       obj1 result; | ||||
|       mult(&result,&lhs._odata[site],&rhs._odata[site]); | ||||
|       vstream(ret._odata[site],result); | ||||
| #else | ||||
|       mult(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // Lattice x Lattice: applies the site-level mac to every outer site, using | ||||
|   // ret as the destination.  ret adopts lhs's checkerboard before the | ||||
|   // conformability checks run. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Seed the temporary with the current value of ret so the site-level mac | ||||
|       // sees the same destination contents as the non-streaming branch; a | ||||
|       // default-constructed obj1 may hold an indeterminate value. | ||||
|       obj1 tmp = ret._odata[ss]; | ||||
|       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| #else | ||||
|       mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // Lattice - Lattice: applies the site-level sub to every outer site of lhs | ||||
|   // and rhs, writing into ret.  ret adopts lhs's checkerboard before the | ||||
|   // conformability checks run. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Compute into a local object, then stream it out to ret. | ||||
|       obj1 result; | ||||
|       sub(&result,&lhs._odata[site],&rhs._odata[site]); | ||||
|       vstream(ret._odata[site],result); | ||||
| #else | ||||
|       sub(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|   // Lattice + Lattice: applies the site-level add to every outer site of lhs | ||||
|   // and rhs, writing into ret.  ret adopts lhs's checkerboard before the | ||||
|   // conformability checks run. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     conformable(lhs,rhs); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Compute into a local object, then stream it out to ret. | ||||
|       obj1 result; | ||||
|       add(&result,&lhs._odata[site],&rhs._odata[site]); | ||||
|       vstream(ret._odata[site],result); | ||||
| #else | ||||
|       add(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   //  avoid copy back routines for mult, mac, sub, add: Lattice lhs combined with a single object rhs | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Lattice x single object: applies the site-level mult of each outer site | ||||
|   // of lhs with the same rhs object, streaming the result into ret.  ret | ||||
|   // adopts lhs's checkerboard before the conformability check. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(lhs,ret); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
|       obj1 result; | ||||
|       mult(&result,&lhs._odata[site],&rhs); | ||||
|       vstream(ret._odata[site],result); | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // Lattice x single object: applies the site-level mac of each outer site of | ||||
|   // lhs with the same rhs object, using ret as the destination.  ret adopts | ||||
|   // lhs's checkerboard before the conformability check. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,lhs); | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|       // Seed the temporary with the current value of ret: a default-constructed | ||||
|       // obj1 may hold an indeterminate value, which the site-level mac would | ||||
|       // then read as its destination contents. | ||||
|       obj1 tmp = ret._odata[ss]; | ||||
|       mac(&tmp,&lhs._odata[ss],&rhs); | ||||
|       vstream(ret._odata[ss],tmp); | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // Lattice - single object: applies the site-level sub of each outer site of | ||||
|   // lhs with the same rhs object, writing into ret.  ret adopts lhs's | ||||
|   // checkerboard before the conformability check. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(ret,lhs); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Compute into a local object, then stream it out to ret. | ||||
|       obj1 result; | ||||
|       sub(&result,&lhs._odata[site],&rhs); | ||||
|       vstream(ret._odata[site],result); | ||||
| #else | ||||
|       sub(&ret._odata[site],&lhs._odata[site],&rhs); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|   // Lattice + single object: applies the site-level add of each outer site of | ||||
|   // lhs with the same rhs object, writing into ret.  ret adopts lhs's | ||||
|   // checkerboard before the conformability check. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs) | ||||
|   { | ||||
|     ret.checkerboard = lhs.checkerboard; | ||||
|     conformable(lhs,ret); | ||||
|     const auto osites = lhs._grid->oSites(); | ||||
|     parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       // Compute into a local object, then stream it out to ret. | ||||
|       obj1 result; | ||||
|       add(&result,&lhs._odata[site],&rhs); | ||||
|       vstream(ret._odata[site],result); | ||||
| #else | ||||
|       add(&ret._odata[site],&lhs._odata[site],&rhs); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   //  avoid copy back routines for mult, mac, sub, add: single object lhs combined with a Lattice rhs | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Single object x Lattice: applies the site-level mult of the same lhs | ||||
|     // object with each outer site of rhs, writing into ret.  ret adopts rhs's | ||||
|     // checkerboard before the conformability check. | ||||
|     template<class obj1,class obj2,class obj3> strong_inline | ||||
|     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs) | ||||
|     { | ||||
|       ret.checkerboard = rhs.checkerboard; | ||||
|       conformable(ret,rhs); | ||||
|       const auto osites = rhs._grid->oSites(); | ||||
|       parallel_for(int site=0; site<osites; site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|         // Compute into a local object, then stream it out to ret. | ||||
|         obj1 result; | ||||
|         mult(&result,&lhs,&rhs._odata[site]); | ||||
|         vstream(ret._odata[site],result); | ||||
| #else | ||||
|         mult(&ret._odata[site],&lhs,&rhs._odata[site]); | ||||
| #endif | ||||
|       } | ||||
|     } | ||||
|    | ||||
|   // Applies the element-level mac() to the single operand lhs and every entry of rhs, | ||||
|   // with the corresponding entry of ret as the destination. | ||||
|   // | ||||
|   // The non-streaming branch hands mac() the existing ret entry, so the streaming branch | ||||
|   // must start from that same value: the local is therefore copied from ret before the | ||||
|   // call instead of being left default-constructed. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|     ret.checkerboard = rhs.checkerboard; | ||||
|     conformable(ret,rhs); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 tmp = ret._odata[ss];   // seed with current contents so both branches agree | ||||
|       mac(&tmp,&lhs,&rhs._odata[ss]); | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| #else  | ||||
|       mac(&ret._odata[ss],&lhs,&rhs._odata[ss]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // Applies the element-level sub() to the single operand lhs and every entry of rhs, | ||||
|   // storing the result in ret. With STREAMING_STORES defined the result is built in a | ||||
|   // local and handed to vstream(); otherwise it is computed straight into ret. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs) | ||||
|   { | ||||
|     ret.checkerboard = rhs.checkerboard;  // result takes the checkerboard of the lattice operand | ||||
|     conformable(ret,rhs); | ||||
|     parallel_for(int site=0;site<rhs._grid->oSites();site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 diff; | ||||
|       sub(&diff,&lhs,&rhs._odata[site]); | ||||
|       vstream(ret._odata[site],diff); | ||||
| #else | ||||
|       sub(&ret._odata[site],&lhs,&rhs._odata[site]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|   // Applies the element-level add() to the single operand lhs and every entry of rhs, | ||||
|   // storing the result in ret. With STREAMING_STORES defined the result is built in a | ||||
|   // local and handed to vstream(); otherwise it is computed straight into ret. | ||||
|   template<class obj1,class obj2,class obj3> strong_inline | ||||
|   void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs) | ||||
|   { | ||||
|     ret.checkerboard = rhs.checkerboard;  // result takes the checkerboard of the lattice operand | ||||
|     conformable(ret,rhs); | ||||
|     parallel_for(int site=0;site<rhs._grid->oSites();site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       obj1 total; | ||||
|       add(&total,&lhs,&rhs._odata[site]); | ||||
|       vstream(ret._odata[site],total); | ||||
| #else | ||||
|       add(&ret._odata[site],&lhs,&rhs._odata[site]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|    | ||||
|   // ret[i] = a*x[i] + y[i] for every entry i in [0, x._grid->oSites()). | ||||
|   // With STREAMING_STORES defined the value is formed in a local and written via vstream(). | ||||
|   template<class sobj,class vobj> strong_inline | ||||
|   void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
|   { | ||||
|     ret.checkerboard = x.checkerboard;  // result takes the checkerboard of x | ||||
|     conformable(ret,x); | ||||
|     conformable(x,y); | ||||
|     parallel_for(int site=0;site<x._grid->oSites();site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj value = a*x._odata[site]+y._odata[site]; | ||||
|       vstream(ret._odata[site],value); | ||||
| #else | ||||
|       ret._odata[site]=a*x._odata[site]+y._odata[site]; | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|   // ret[i] = a*x[i] + b*y[i] for every entry i in [0, x._grid->oSites()). | ||||
|   // With STREAMING_STORES defined the value is formed in a local and written via vstream(). | ||||
|   template<class sobj,class vobj> strong_inline | ||||
|   void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
|   { | ||||
|     ret.checkerboard = x.checkerboard;  // result takes the checkerboard of x | ||||
|     conformable(ret,x); | ||||
|     conformable(x,y); | ||||
|     parallel_for(int site=0;site<x._grid->oSites();site++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj value = a*x._odata[site]+b*y._odata[site]; | ||||
|       vstream(ret._odata[site],value); | ||||
| #else | ||||
|       ret._odata[site]=a*x._odata[site]+b*y._odata[site]; | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Thin wrapper: forwards all arguments unchanged to axpy_norm_fast() and returns its RealD result. | ||||
|   template<class sobj,class vobj> strong_inline | ||||
|   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ | ||||
|     return axpy_norm_fast(ret,a,x,y); | ||||
|   } | ||||
|   // Thin wrapper: forwards all arguments unchanged to axpby_norm_fast() and returns its RealD result. | ||||
|   template<class sobj,class vobj> strong_inline | ||||
|   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ | ||||
|     return axpby_norm_fast(ret,a,b,x,y); | ||||
|   } | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| //  avoid copy back routines for mult, mac, sub, add | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Site-wise product of two lattices: applies the element-level mult() to matching entries | ||||
| // of lhs and rhs and writes the result into ret. | ||||
| // | ||||
| // The conformable() checks run before any view is opened, in the same order used by the | ||||
| // sibling mac/sub/add overloads, so a mismatch is reported before views are acquired. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   conformable(lhs,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     // Copy the operands into locals so their addresses can be passed to mult(). | ||||
|     auto lhs_t = lhs_v(ss); | ||||
|     auto rhs_t = rhs_v(ss); | ||||
|     mult(&tmp,&lhs_t,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Site-wise multiply-accumulate of two lattices into ret via the element-level mac(). | ||||
| // | ||||
| // The accumulator is loaded from ret before mac() is called; previously it was left | ||||
| // default-constructed, so whatever mac() combined its operands with was indeterminate. | ||||
| // NOTE(review): relies on the AcceleratorWrite view exposing ret's existing values - confirm. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   conformable(lhs,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     auto tmp  =ret_v(ss);   // start from the current value of ret at this site | ||||
|     auto lhs_t=lhs_v(ss); | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     mac(&tmp,&lhs_t,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Site-wise difference of two lattices: applies the element-level sub() to matching | ||||
| // entries of lhs and rhs and writes the result into ret. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs) | ||||
| { | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   conformable(lhs,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(site,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) diff; | ||||
|     // Operands are copied into locals so their addresses can be passed on. | ||||
|     auto left  = lhs_v(site); | ||||
|     auto right = rhs_v(site); | ||||
|     sub(&diff,&left,&right); | ||||
|     coalescedWrite(ret_v[site],diff); | ||||
|   }); | ||||
| } | ||||
| // Site-wise sum of two lattices: applies the element-level add() to matching | ||||
| // entries of lhs and rhs and writes the result into ret. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs) | ||||
| { | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   conformable(lhs,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(site,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) total; | ||||
|     // Operands are copied into locals so their addresses can be passed on. | ||||
|     auto left  = lhs_v(site); | ||||
|     auto right = rhs_v(site); | ||||
|     add(&total,&left,&right); | ||||
|     coalescedWrite(ret_v[site],total); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| //  avoid copy back routines for mult, mac, sub, add | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Applies the element-level mult() to every entry of lhs together with the single | ||||
| // operand rhs, writing the result into ret. | ||||
| // | ||||
| // The site value is first copied into a local (as the sibling mac/sub/add overloads do) | ||||
| // rather than taking the address of the lhs_v(ss) call result directly. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(lhs,ret); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto lhs_t=lhs_v(ss); | ||||
|     mult(&tmp,&lhs_t,&rhs); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Multiply-accumulate of every entry of lhs with the single operand rhs into ret, | ||||
| // via the element-level mac(). | ||||
| // | ||||
| // The accumulator is loaded from ret before mac() is called; previously it was left | ||||
| // default-constructed, so whatever mac() combined its operands with was indeterminate. | ||||
| // NOTE(review): relies on the AcceleratorWrite view exposing ret's existing values - confirm. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,lhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     auto tmp  =ret_v(ss);   // start from the current value of ret at this site | ||||
|     auto lhs_t=lhs_v(ss); | ||||
|     mac(&tmp,&lhs_t,&rhs); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Applies the element-level sub() to every entry of lhs together with the single | ||||
| // operand rhs, writing the result into ret. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs) | ||||
| { | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(ret,lhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(site,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) diff; | ||||
|     auto left = lhs_v(site);   // local copy so its address can be passed on | ||||
|     sub(&diff,&left,&rhs); | ||||
|     coalescedWrite(ret_v[site],diff); | ||||
|   }); | ||||
| } | ||||
| // Applies the element-level add() to every entry of lhs together with the single | ||||
| // operand rhs, writing the result into ret. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs) | ||||
| { | ||||
|   ret.Checkerboard() = lhs.Checkerboard(); | ||||
|   conformable(lhs,ret); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(site,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) total; | ||||
|     auto left = lhs_v(site);   // local copy so its address can be passed on | ||||
|     add(&total,&left,&rhs); | ||||
|     coalescedWrite(ret_v[site],total); | ||||
|   }); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| //  avoid copy back routines for mult, mac, sub, add | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Applies the element-level mult() to the single operand lhs and every entry of rhs, | ||||
| // writing the result into ret. | ||||
| // | ||||
| // The read view is opened on rhs, the Lattice argument whose entries are iterated; | ||||
| // it was previously opened on lhs, which is not a Lattice in this overload. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = rhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     mult(&tmp,&lhs,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Multiply-accumulate of the single operand lhs with every entry of rhs into ret, | ||||
| // via the element-level mac(). | ||||
| // | ||||
| // Two corrections relative to the earlier form: | ||||
| //  - the read view is opened on rhs (the Lattice argument), not on lhs; | ||||
| //  - the accumulator is loaded from ret instead of being left default-constructed. | ||||
| // NOTE(review): relies on the AcceleratorWrite view exposing ret's existing values - confirm. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = rhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ | ||||
|     auto tmp  =ret_v(ss);   // start from the current value of ret at this site | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     mac(&tmp,&lhs,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // Applies the element-level sub() to the single operand lhs and every entry of rhs, | ||||
| // writing the result into ret. | ||||
| // | ||||
| // The read view is opened on rhs, the Lattice argument whose entries are iterated; | ||||
| // it was previously opened on lhs, which is not a Lattice in this overload. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = rhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     sub(&tmp,&lhs,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
| // Applies the element-level add() to the single operand lhs and every entry of rhs, | ||||
| // writing the result into ret. | ||||
| // | ||||
| // The read view is opened on rhs, the Lattice argument whose entries are iterated; | ||||
| // it was previously opened on lhs, which is not a Lattice in this overload. | ||||
| template<class obj1,class obj2,class obj3> inline | ||||
| void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|   ret.Checkerboard() = rhs.Checkerboard(); | ||||
|   conformable(ret,rhs); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     add(&tmp,&lhs,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| } | ||||
|    | ||||
| // ret(site) = a*x(site) + y(site) for every site of the x view. | ||||
| template<class sobj,class vobj> inline | ||||
| void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
| { | ||||
|   ret.Checkerboard() = x.Checkerboard();   // result takes the checkerboard of x | ||||
|   conformable(ret,x); | ||||
|   conformable(x,y); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( x_v , x, AcceleratorRead); | ||||
|   autoView( y_v , y, AcceleratorRead); | ||||
|   accelerator_for(site,x_v.size(),vobj::Nsimd(),{ | ||||
|     auto value = a*x_v(site)+y_v(site); | ||||
|     coalescedWrite(ret_v[site],value); | ||||
|   }); | ||||
| } | ||||
| // ret(site) = a*x(site) + b*y(site) for every site of the x view. | ||||
| template<class sobj,class vobj> inline | ||||
| void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
| { | ||||
|   ret.Checkerboard() = x.Checkerboard();   // result takes the checkerboard of x | ||||
|   conformable(ret,x); | ||||
|   conformable(x,y); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( x_v , x, AcceleratorRead); | ||||
|   autoView( y_v , y, AcceleratorRead); | ||||
|   accelerator_for(site,x_v.size(),vobj::Nsimd(),{ | ||||
|     auto value = a*x_v(site)+b*y_v(site); | ||||
|     coalescedWrite(ret_v[site],value); | ||||
|   }); | ||||
| } | ||||
|  | ||||
| // Thin wrapper: forwards all arguments unchanged to axpy_norm_fast() and returns its RealD result. | ||||
| template<class sobj,class vobj> inline | ||||
| RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
| { | ||||
|     return axpy_norm_fast(ret,a,x,y); | ||||
| } | ||||
| // Thin wrapper: forwards all arguments unchanged to axpby_norm_fast() and returns its RealD result. | ||||
| template<class sobj,class vobj> inline | ||||
| RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||
| { | ||||
|     return axpby_norm_fast(ret,a,b,x,y); | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -9,6 +9,7 @@ Copyright (C) 2015 | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Christoph Lehner <christoph@lhnr.de> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| @@ -27,349 +28,345 @@ with this program; if not, write to the Free Software Foundation, Inc., | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_BASE_H | ||||
| #define GRID_LATTICE_BASE_H | ||||
| 			   /*  END LEGAL */ | ||||
|  | ||||
| #pragma once  | ||||
|  | ||||
| #define STREAMING_STORES | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| // TODO:  | ||||
| //       mac,real,imag | ||||
|  | ||||
| // Functionality: | ||||
| //     -=,+=,*=,() | ||||
| //     add,+,sub,-,mult,mac,* | ||||
| //     adj,conjugate | ||||
| //     real,imag | ||||
| //     transpose,transposeIndex   | ||||
| //     trace,traceIndex | ||||
| //     peekIndex | ||||
| //     innerProduct,outerProduct, | ||||
| //     localNorm2 | ||||
| //     localInnerProduct | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| extern int GridCshiftPermuteMap[4][16]; | ||||
|  | ||||
| //////////////////////////////////////////////// | ||||
| // Basic expressions used in Expression Template | ||||
| //////////////////////////////////////////////// | ||||
|  | ||||
| // Polymorphic base that holds the grid pointer for lattice objects. | ||||
| class LatticeBase | ||||
| { | ||||
| public: | ||||
|     virtual ~LatticeBase(void) = default; | ||||
|     // Defaults to nullptr so a lattice whose grid was never set holds a detectable | ||||
|     // null pointer instead of an indeterminate value. | ||||
|     GridBase *_grid = nullptr; | ||||
| }; | ||||
|      | ||||
| // Empty tag type; the Lattice*Expression templates below derive from it. | ||||
| class LatticeExpressionBase {}; | ||||
|  | ||||
| // Expression node with one operand: an (Op, tuple<T1>) pair tagged with LatticeExpressionBase. | ||||
| template <typename Op, typename T1> | ||||
| class LatticeUnaryExpression | ||||
|   : public std::pair<Op,std::tuple<T1> >, | ||||
|     public LatticeExpressionBase | ||||
| { | ||||
| public: | ||||
|   // Copies an already-built (operator, operands) pair into the base sub-object. | ||||
|   LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &arg) | ||||
|     : std::pair<Op,std::tuple<T1> >(arg) | ||||
|   {} | ||||
| }; | ||||
|  | ||||
| // Expression node with two operands: an (Op, tuple<T1,T2>) pair tagged with LatticeExpressionBase. | ||||
| template <typename Op, typename T1, typename T2> | ||||
| class LatticeBinaryExpression | ||||
|   : public std::pair<Op,std::tuple<T1,T2> >, | ||||
|     public LatticeExpressionBase | ||||
| { | ||||
| public: | ||||
|   // Copies an already-built (operator, operands) pair into the base sub-object. | ||||
|   LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &arg) | ||||
|     : std::pair<Op,std::tuple<T1,T2> >(arg) | ||||
|   {} | ||||
| }; | ||||
|  | ||||
| // Expression node with three operands: an (Op, tuple<T1,T2,T3>) pair tagged with LatticeExpressionBase. | ||||
| template <typename Op, typename T1, typename T2, typename T3> | ||||
| class LatticeTrinaryExpression | ||||
|   : public std::pair<Op,std::tuple<T1,T2,T3> >, | ||||
|     public LatticeExpressionBase | ||||
| { | ||||
| public: | ||||
|   // Copies an already-built (operator, operands) pair into the base sub-object. | ||||
|   LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg) | ||||
|     : std::pair<Op,std::tuple<T1,T2,T3> >(arg) | ||||
|   {} | ||||
| }; | ||||
|  | ||||
| // Grid-level conformability check: asserts that both arguments are the same pointer. | ||||
| // The comparison is on pointer identity only; the pointed-to grids are not inspected. | ||||
| void inline conformable(GridBase *lhs,GridBase *rhs) | ||||
| { | ||||
|   assert((lhs == rhs) && " conformable check pointers mismatch "); | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ||||
| // The real lattice class, with normal copy and assignment semantics. | ||||
| // This contains extra (host resident) grid pointer data that may be accessed by host code | ||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class vobj> | ||||
| class Lattice : public LatticeBase | ||||
| class Lattice : public LatticeAccelerator<vobj> | ||||
| { | ||||
| public: | ||||
|     int checkerboard; | ||||
|     Vector<vobj> _odata; | ||||
|      | ||||
|     // to pthread need a computable loop where loop induction is not required | ||||
|     int begin(void) { return 0;}; | ||||
|     int end(void)   { return _odata.size(); } | ||||
|     vobj & operator[](int i) { return _odata[i]; }; | ||||
|     const vobj & operator[](int i) const { return _odata[i]; }; | ||||
|   // Read-only accessor for the grid pointer stored in this lattice. | ||||
|   GridBase *Grid(void) const { return this->_grid; } | ||||
|   /////////////////////////////////////////////////// | ||||
|   // Member types | ||||
|   /////////////////////////////////////////////////// | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_object scalar_object; | ||||
|   typedef vobj vector_object; | ||||
|  | ||||
| private: | ||||
|   // Frees the site buffer through alignedAllocator and resets pointer and size. | ||||
|   // Does nothing when the recorded size is zero. | ||||
|   void dealloc(void) | ||||
|   { | ||||
|     if ( this->_odata_size == 0 ) return; | ||||
|  | ||||
|     alignedAllocator<vobj> alloc; | ||||
|     alloc.deallocate(this->_odata,this->_odata_size); | ||||
|     this->_odata_size=0; | ||||
|     this->_odata=nullptr; | ||||
|   } | ||||
|   // Re-allocates the site buffer to hold `size` entries. Existing contents are not | ||||
|   // carried over: the old buffer is released before the new one is obtained. | ||||
|   // A request equal to the current size is a no-op; size 0 leaves a null buffer. | ||||
|   void resize(uint64_t size) | ||||
|   { | ||||
|     if ( this->_odata_size == size ) return; | ||||
|  | ||||
|     alignedAllocator<vobj> alloc; | ||||
|     dealloc(); | ||||
|  | ||||
|     this->_odata_size = size; | ||||
|     if ( size == 0 ) { | ||||
|       this->_odata = nullptr; | ||||
|     } else { | ||||
|       this->_odata = alloc.allocate(this->_odata_size); | ||||
|     } | ||||
|   } | ||||
| public: | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|     typedef vobj vector_object; | ||||
|     | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////////////////// | ||||
|   // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents | ||||
|   ///////////////////////////////////////////////////////////////////////////////// | ||||
|   // Opens a view of this lattice in the requested mode and closes it again immediately. | ||||
|   void SetViewMode(ViewMode mode) | ||||
|   { | ||||
|     LatticeAccelerator<vobj> *self = (LatticeAccelerator<vobj> *) this; | ||||
|     LatticeView<vobj> accessor(*self,mode); | ||||
|     accessor.ViewClose(); | ||||
|   } | ||||
|   ///////////////////////////////////////////////////////////////////////////////// | ||||
|   // Return a view object that may be dereferenced in site loops. | ||||
|   // The view is trivially copy constructible and may be copied to an accelerator device | ||||
|   // in device lambdas | ||||
|   ///////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   // Builds and returns a LatticeView of this lattice opened in `mode`. | ||||
|   // The method is const, so the cast below also drops constness from `this`. | ||||
|   LatticeView<vobj> View (ViewMode mode) const | ||||
|   { | ||||
|     LatticeAccelerator<vobj> *self = (LatticeAccelerator<vobj> *) this; | ||||
|     LatticeView<vobj> accessor(*self,mode); | ||||
|     return accessor; | ||||
|   } | ||||
|  | ||||
|   // Releases the site buffer if one is currently held. | ||||
|   ~Lattice() | ||||
|   { | ||||
|     if ( this->_odata_size != 0 ) dealloc(); | ||||
|   } | ||||
|   //////////////////////////////////////////////////////////////////////////////// | ||||
|   // Expression Template closure support | ||||
|   //////////////////////////////////////////////////////////////////////////////// | ||||
|   template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr) | ||||
|   template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr) | ||||
|   { | ||||
|     GridBase *egrid(nullptr); | ||||
|     GridFromExpression(egrid,expr); | ||||
|     assert(egrid!=nullptr); | ||||
|     conformable(_grid,egrid); | ||||
|     conformable(this->_grid,egrid); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|  | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| #else | ||||
|       _odata[ss]=eval(ss,expr); | ||||
| #endif | ||||
|     } | ||||
|     this->checkerboard=cb; | ||||
|      | ||||
|     auto exprCopy = expr; | ||||
|     ExpressionViewOpen(exprCopy); | ||||
|     auto me  = View(AcceleratorWriteDiscard); | ||||
|     accelerator_for(ss,me.size(),1,{ | ||||
|       auto tmp = eval(ss,exprCopy); | ||||
|       vstream(me[ss],tmp); | ||||
|     }); | ||||
|     me.ViewClose(); | ||||
|     ExpressionViewClose(exprCopy); | ||||
|     return *this; | ||||
|   } | ||||
|   template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) | ||||
|   template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) | ||||
|   { | ||||
|     GridBase *egrid(nullptr); | ||||
|     GridFromExpression(egrid,expr); | ||||
|     assert(egrid!=nullptr); | ||||
|     conformable(_grid,egrid); | ||||
|     conformable(this->_grid,egrid); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|     this->checkerboard=cb; | ||||
|  | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| #else | ||||
|       _odata[ss]=eval(ss,expr); | ||||
| #endif | ||||
|     } | ||||
|     auto exprCopy = expr; | ||||
|     ExpressionViewOpen(exprCopy); | ||||
|     auto me  = View(AcceleratorWriteDiscard); | ||||
|     accelerator_for(ss,me.size(),1,{ | ||||
|       auto tmp = eval(ss,exprCopy); | ||||
|       vstream(me[ss],tmp); | ||||
|     }); | ||||
|     me.ViewClose(); | ||||
|     ExpressionViewClose(exprCopy); | ||||
|     return *this; | ||||
|   } | ||||
|   template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) | ||||
|   template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) | ||||
|   { | ||||
|     GridBase *egrid(nullptr); | ||||
|     GridFromExpression(egrid,expr); | ||||
|     assert(egrid!=nullptr); | ||||
|     conformable(_grid,egrid); | ||||
|     conformable(this->_grid,egrid); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|  | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       //vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,eval(ss,expr)); | ||||
| #else | ||||
|       _odata[ss] = eval(ss,expr); | ||||
| #endif | ||||
|     } | ||||
|     this->checkerboard=cb; | ||||
|     auto exprCopy = expr; | ||||
|     ExpressionViewOpen(exprCopy); | ||||
|     auto me  = View(AcceleratorWriteDiscard); | ||||
|     accelerator_for(ss,me.size(),1,{ | ||||
|       auto tmp = eval(ss,exprCopy); | ||||
|       vstream(me[ss],tmp); | ||||
|     }); | ||||
|     me.ViewClose(); | ||||
|     ExpressionViewClose(exprCopy); | ||||
|     return *this; | ||||
|   } | ||||
|   //GridFromExpression is tricky to do | ||||
|   template<class Op,class T1> | ||||
|     Lattice(const LatticeUnaryExpression<Op,T1> & expr) { | ||||
|     _grid = nullptr; | ||||
|     GridFromExpression(_grid,expr); | ||||
|     assert(_grid!=nullptr); | ||||
|   Lattice(const LatticeUnaryExpression<Op,T1> & expr) { | ||||
|     this->_grid = nullptr; | ||||
|     GridFromExpression(this->_grid,expr); | ||||
|     assert(this->_grid!=nullptr); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|     this->checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| #else | ||||
|       _odata[ss]=eval(ss,expr); | ||||
| #endif | ||||
|     } | ||||
|   }; | ||||
|     resize(this->_grid->oSites()); | ||||
|  | ||||
|     *this = expr; | ||||
|   } | ||||
|   template<class Op,class T1, class T2> | ||||
|   Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { | ||||
|     _grid = nullptr; | ||||
|     GridFromExpression(_grid,expr); | ||||
|     assert(_grid!=nullptr); | ||||
|     this->_grid = nullptr; | ||||
|     GridFromExpression(this->_grid,expr); | ||||
|     assert(this->_grid!=nullptr); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|     this->checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
| #ifdef STREAMING_STORES | ||||
|       vobj tmp = eval(ss,expr); | ||||
|       vstream(_odata[ss] ,tmp); | ||||
| #else | ||||
|       _odata[ss]=eval(ss,expr); | ||||
| #endif | ||||
|     } | ||||
|   }; | ||||
|     resize(this->_grid->oSites()); | ||||
|  | ||||
|     *this = expr; | ||||
|   } | ||||
|   template<class Op,class T1, class T2, class T3> | ||||
|   Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { | ||||
|     _grid = nullptr; | ||||
|     GridFromExpression(_grid,expr); | ||||
|     assert(_grid!=nullptr); | ||||
|     this->_grid = nullptr; | ||||
|     GridFromExpression(this->_grid,expr); | ||||
|     assert(this->_grid!=nullptr); | ||||
|  | ||||
|     int cb=-1; | ||||
|     CBFromExpression(cb,expr); | ||||
|     assert( (cb==Odd) || (cb==Even)); | ||||
|     checkerboard=cb; | ||||
|     this->checkerboard=cb; | ||||
|  | ||||
|     _odata.resize(_grid->oSites()); | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       vstream(_odata[ss] ,eval(ss,expr)); | ||||
|     } | ||||
|   }; | ||||
|     resize(this->_grid->oSites()); | ||||
|  | ||||
|     *this = expr; | ||||
|   } | ||||
|  | ||||
|   // Fill assignment: stores r into every entry of this lattice through a CpuWrite view. | ||||
|   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r) | ||||
|   { | ||||
|     auto host_view = View(CpuWrite); | ||||
|     thread_for(idx,host_view.size(),{ | ||||
|       host_view[idx] = r; | ||||
|     }); | ||||
|     host_view.ViewClose(); | ||||
|     return *this; | ||||
|   } | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////// | ||||
|   // Constructor requires "grid" passed. | ||||
|   // what about a default grid? | ||||
|   ////////////////////////////////////////////////////////////////// | ||||
|   Lattice(GridBase *grid) : _odata(grid->oSites()) { | ||||
|     _grid = grid; | ||||
|     //        _odata.reserve(_grid->oSites()); | ||||
|     //        _odata.resize(_grid->oSites()); | ||||
|     //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl; | ||||
|     assert((((uint64_t)&_odata[0])&0xF) ==0); | ||||
|     checkerboard=0; | ||||
|   // Follow rule of five, with Constructor requires "grid" passed | ||||
|   // to user defined constructor | ||||
|   /////////////////////////////////////////// | ||||
|   // user defined constructor | ||||
|   /////////////////////////////////////////// | ||||
|   // Builds a lattice on `grid`: sizes the buffer to grid->oSites() entries, asserts that | ||||
|   // the buffer address has its low four bits clear, sets checkerboard to 0 and finally | ||||
|   // applies the requested view mode via SetViewMode(). | ||||
|   Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) | ||||
|   { | ||||
|     this->checkerboard=0; | ||||
|     this->_grid = grid; | ||||
|     resize(grid->oSites()); | ||||
|     assert((((uint64_t)&this->_odata[0])&0xF) ==0); | ||||
|     SetViewMode(mode); | ||||
|   } | ||||
|    | ||||
|   Lattice(const Lattice& r){ // copy constructor | ||||
|     _grid = r._grid; | ||||
|     checkerboard = r.checkerboard; | ||||
|     _odata.resize(_grid->oSites());// essential | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       _odata[ss]=r._odata[ss]; | ||||
|     }  	 | ||||
|   } | ||||
|  | ||||
|   Lattice(Lattice&& r){ // move constructor | ||||
|     _grid = r._grid; | ||||
|     checkerboard = r.checkerboard; | ||||
|     _odata=std::move(r._odata); | ||||
|   } | ||||
|    | ||||
|   inline Lattice<vobj> & operator = (Lattice<vobj> && r) | ||||
|   { | ||||
|     _grid        = r._grid; | ||||
|     checkerboard = r.checkerboard; | ||||
|     _odata       =std::move(r._odata); | ||||
|     return *this; | ||||
|   } | ||||
|  | ||||
|   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ | ||||
|     _grid        = r._grid; | ||||
|     checkerboard = r.checkerboard; | ||||
|     _odata.resize(_grid->oSites());// essential | ||||
|      | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       _odata[ss]=r._odata[ss]; | ||||
|     }  	 | ||||
|     return *this; | ||||
|   } | ||||
|  | ||||
|   template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ | ||||
|     this->checkerboard = r.checkerboard; | ||||
|     conformable(*this,r); | ||||
|      | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       this->_odata[ss]=r._odata[ss]; | ||||
|     } | ||||
|     return *this; | ||||
|   } | ||||
|  | ||||
|   virtual ~Lattice(void) = default; | ||||
|   //  virtual ~Lattice(void) = default; | ||||
|      | ||||
|   void reset(GridBase* grid) { | ||||
|     if (_grid != grid) { | ||||
|       _grid = grid; | ||||
|       _odata.resize(grid->oSites()); | ||||
|       checkerboard = 0; | ||||
|     if (this->_grid != grid) { | ||||
|       this->_grid = grid; | ||||
|       this->resize(grid->oSites()); | ||||
|       this->checkerboard = 0; | ||||
|     } | ||||
|   } | ||||
|    | ||||
|  | ||||
|   template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ | ||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||
|       this->_odata[ss]=r; | ||||
|     } | ||||
|   /////////////////////////////////////////// | ||||
|   // copy constructor | ||||
|   /////////////////////////////////////////// | ||||
|   // Adopts r's grid, allocates a buffer of matching size, then delegates the | ||||
|   // element copy (and checkerboard) to the assignment operator. | ||||
|   Lattice(const Lattice& r) | ||||
|   { | ||||
|     GridBase *source_grid = r.Grid(); | ||||
|     this->_grid = source_grid; | ||||
|     resize(source_grid->oSites()); | ||||
|     *this = r; | ||||
|   } | ||||
|   /////////////////////////////////////////// | ||||
|   // move constructor | ||||
|   /////////////////////////////////////////// | ||||
|   // Takes over r's buffer, size, grid and checkerboard; r is left with a null | ||||
|   // buffer of size zero so it no longer refers to the transferred storage. | ||||
|   Lattice(Lattice && r) | ||||
|   { | ||||
|     this->checkerboard= r.Checkerboard(); | ||||
|     this->_grid       = r.Grid(); | ||||
|  | ||||
|     // transfer the storage ... | ||||
|     this->_odata_size = r._odata_size; | ||||
|     this->_odata      = r._odata; | ||||
|  | ||||
|     // ... and detach it from the source | ||||
|     r._odata_size = 0; | ||||
|     r._odata      = nullptr; | ||||
|   } | ||||
|   /////////////////////////////////////////// | ||||
|   // assignment template | ||||
|   /////////////////////////////////////////// | ||||
|   template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){ | ||||
|     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; | ||||
|     conformable(*this,r); | ||||
|     this->checkerboard = r.Checkerboard(); | ||||
|     auto me =   View(AcceleratorWriteDiscard); | ||||
|     auto him= r.View(AcceleratorRead); | ||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||
|       coalescedWrite(me[ss],him(ss)); | ||||
|     }); | ||||
|     me.ViewClose();    him.ViewClose(); | ||||
|     return *this; | ||||
|   } | ||||
|    | ||||
|    | ||||
|  | ||||
|   /////////////////////////////////////////// | ||||
|   // Copy assignment  | ||||
|   /////////////////////////////////////////// | ||||
|   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ | ||||
|     this->checkerboard = r.Checkerboard(); | ||||
|     conformable(*this,r); | ||||
|     auto me =   View(AcceleratorWriteDiscard); | ||||
|     auto him= r.View(AcceleratorRead); | ||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||
|       coalescedWrite(me[ss],him(ss)); | ||||
|     }); | ||||
|     me.ViewClose();    him.ViewClose(); | ||||
|     return *this; | ||||
|   } | ||||
|   /////////////////////////////////////////// | ||||
|   // Move assignment possible if same type | ||||
|   /////////////////////////////////////////// | ||||
|   inline Lattice<vobj> & operator = (Lattice<vobj> && r){ | ||||
|  | ||||
|     resize(0); // deletes if appropriate | ||||
|     this->_grid       = r.Grid(); | ||||
|     this->_odata      = r._odata; | ||||
|     this->_odata_size = r._odata_size; | ||||
|     this->checkerboard= r.Checkerboard(); | ||||
|  | ||||
|     r._odata      = nullptr; | ||||
|     r._odata_size = 0; | ||||
|      | ||||
|     return *this; | ||||
|   } | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////////////// | ||||
|   // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation | ||||
|   template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { | ||||
|   ///////////////////////////////////////////////////////////////////////////// | ||||
|   template<class T> inline Lattice<vobj> &operator *=(const T &r) { | ||||
|     *this = (*this)*r; | ||||
|     return *this; | ||||
|   } | ||||
|    | ||||
|   template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { | ||||
|   template<class T> inline Lattice<vobj> &operator -=(const T &r) { | ||||
|     *this = (*this)-r; | ||||
|     return *this; | ||||
|   } | ||||
|   template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { | ||||
|   template<class T> inline Lattice<vobj> &operator +=(const T &r) { | ||||
|     *this = (*this)+r; | ||||
|     return *this; | ||||
|   } | ||||
| }; // class Lattice | ||||
|    | ||||
|   template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ | ||||
|     std::vector<int> gcoor; | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     sobj ss; | ||||
|     for(int g=0;g<o._grid->_gsites;g++){ | ||||
|       o._grid->GlobalIndexToGlobalCoor(g,gcoor); | ||||
|       peekSite(ss,o,gcoor); | ||||
|       stream<<"["; | ||||
|       for(int d=0;d<gcoor.size();d++){ | ||||
| 	stream<<gcoor[d]; | ||||
| 	if(d!=gcoor.size()-1) stream<<","; | ||||
|       } | ||||
|       stream<<"]\t"; | ||||
|       stream<<ss<<std::endl; | ||||
|     } | ||||
|     return stream; | ||||
|  | ||||
|   friend inline void swap(Lattice &l, Lattice &r) {  | ||||
|     conformable(l,r); | ||||
|     LatticeAccelerator<vobj> tmp; | ||||
|     LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l; | ||||
|     LatticeAccelerator<vobj> *rp = (LatticeAccelerator<vobj> *)&r; | ||||
|     tmp = *lp;    *lp=*rp;    *rp=tmp; | ||||
|   } | ||||
|    | ||||
|  | ||||
| }; // class Lattice | ||||
|  | ||||
| template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   for(int g=0;g<o.Grid()->_gsites;g++){ | ||||
|  | ||||
|     Coordinate gcoor; | ||||
|     o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); | ||||
|  | ||||
|     sobj ss; | ||||
|     peekSite(ss,o,gcoor); | ||||
|     stream<<"["; | ||||
|     for(int d=0;d<gcoor.size();d++){ | ||||
|       stream<<gcoor[d]; | ||||
|       if(d!=gcoor.size()-1) stream<<","; | ||||
|     } | ||||
|     stream<<"]\t"; | ||||
|     stream<<ss<<std::endl; | ||||
|   } | ||||
|   return stream; | ||||
| } | ||||
|    | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|  | ||||
| #include "Lattice_conformable.h" | ||||
| #define GRID_LATTICE_EXPRESSION_TEMPLATES | ||||
| #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES | ||||
| #include "Lattice_ET.h" | ||||
| #else  | ||||
| #include "Lattice_overload.h" | ||||
| #endif | ||||
| #include "Lattice_arith.h" | ||||
| #include "Lattice_trace.h" | ||||
| #include "Lattice_transpose.h" | ||||
| #include "Lattice_local.h" | ||||
| #include "Lattice_reduction.h" | ||||
| #include "Lattice_peekpoke.h" | ||||
| #include "Lattice_reality.h" | ||||
| #include "Lattice_comparison_utils.h" | ||||
| #include "Lattice_comparison.h" | ||||
| #include "Lattice_coordinate.h" | ||||
| #include "Lattice_where.h" | ||||
| #include "Lattice_rng.h" | ||||
| #include "Lattice_unary.h" | ||||
| #include "Lattice_transfer.h" | ||||
|  | ||||
|  | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										226
									
								
								Grid/lattice/Lattice_basis.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										226
									
								
								Grid/lattice/Lattice_basis.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,226 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/lattice/Lattice_basis.h | ||||
|  | ||||
| Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Christoph Lehner <christoph@lhnr.de> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| 			   /*  END LEGAL */ | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
// Sequentially remove from w its overlap with each of basis[0..k-1]:
//   w <- w - innerProduct(basis[j],w) * basis[j]
// Each step uses the w already updated by the previous ones.
//
// Optimisation note: if the basis[j] may be assumed orthonormal, all inner
// products could be taken in parallel (saving 2x bandwidth) and the update
// fused (3x on the second line of the loop) -- perhaps a 2.5x speed up,
// 2x overall in Multigrid Lanczos.
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
{
  int j = 0;
  while (j < k) {
    Field &b_j = basis[j];
    auto overlap = innerProduct(b_j,w);
    w = w - overlap*b_j;
    ++j;
  }
}
|  | ||||
| template<class VField, class Matrix> | ||||
| void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)  | ||||
| { | ||||
|   typedef decltype(basis[0]) Field; | ||||
|   typedef decltype(basis[0].View(AcceleratorRead)) View; | ||||
|  | ||||
|   Vector<View> basis_v; basis_v.reserve(basis.size()); | ||||
|   GridBase* grid = basis[0].Grid(); | ||||
|        | ||||
|   for(int k=0;k<basis.size();k++){ | ||||
|     basis_v.push_back(basis[k].View(AcceleratorWrite)); | ||||
|   } | ||||
|  | ||||
|  | ||||
|   View *basis_vp = &basis_v[0]; | ||||
|  | ||||
|   int nrot = j1-j0; | ||||
|   if (!nrot) // edge case not handled gracefully by Cuda | ||||
|     return; | ||||
|  | ||||
|   uint64_t oSites   =grid->oSites(); | ||||
|   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead | ||||
|  | ||||
|   typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj; | ||||
|  | ||||
|   Vector <vobj> Bt(siteBlock * nrot);  | ||||
|   auto Bp=&Bt[0]; | ||||
|  | ||||
|   // GPU readable copy of matrix | ||||
|   Vector<double> Qt_jv(Nm*Nm); | ||||
|   double *Qt_p = & Qt_jv[0]; | ||||
|   thread_for(i,Nm*Nm,{ | ||||
|       int j = i/Nm; | ||||
|       int k = i%Nm; | ||||
|       Qt_p[i]=Qt(j,k); | ||||
|   }); | ||||
|  | ||||
|   // Block the loop to keep storage footprint down | ||||
|   for(uint64_t s=0;s<oSites;s+=siteBlock){ | ||||
|  | ||||
|     // remaining work in this block | ||||
|     int ssites=MIN(siteBlock,oSites-s); | ||||
|  | ||||
|     // zero out the accumulators | ||||
|     accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{ | ||||
| 	decltype(coalescedRead(Bp[ss])) z; | ||||
| 	z=Zero(); | ||||
| 	coalescedWrite(Bp[ss],z); | ||||
|       }); | ||||
|  | ||||
|     accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{ | ||||
| 	 | ||||
| 	int j =sj%nrot; | ||||
| 	int jj  =j0+j; | ||||
| 	int ss =sj/nrot; | ||||
| 	int sss=ss+s; | ||||
|  | ||||
| 	for(int k=k0; k<k1; ++k){ | ||||
| 	  auto tmp = coalescedRead(Bp[ss*nrot+j]); | ||||
| 	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss])); | ||||
| 	} | ||||
|       }); | ||||
|  | ||||
|     accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{ | ||||
| 	int j =sj%nrot; | ||||
| 	int jj  =j0+j; | ||||
| 	int ss =sj/nrot; | ||||
| 	int sss=ss+s; | ||||
| 	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j])); | ||||
|       }); | ||||
|   } | ||||
|  | ||||
|   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); | ||||
| } | ||||
|  | ||||
| // Extract a single rotated vector | ||||
| template<class Field> | ||||
| void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)  | ||||
| { | ||||
|   typedef decltype(basis[0].View(AcceleratorRead)) View; | ||||
|   typedef typename Field::vector_object vobj; | ||||
|   GridBase* grid = basis[0].Grid(); | ||||
|  | ||||
|   result.Checkerboard() = basis[0].Checkerboard(); | ||||
|  | ||||
|   Vector<View> basis_v; basis_v.reserve(basis.size()); | ||||
|   for(int k=0;k<basis.size();k++){ | ||||
|     basis_v.push_back(basis[k].View(AcceleratorRead)); | ||||
|   } | ||||
|   vobj zz=Zero(); | ||||
|   Vector<double> Qt_jv(Nm); | ||||
|   double * Qt_j = & Qt_jv[0]; | ||||
|   for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); | ||||
|  | ||||
|   autoView(result_v,result,AcceleratorWrite); | ||||
|   accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ | ||||
|     auto B=coalescedRead(zz); | ||||
|     for(int k=k0; k<k1; ++k){ | ||||
|       B +=Qt_j[k] * coalescedRead(basis_v[k][ss]); | ||||
|     } | ||||
|     coalescedWrite(result_v[ss], B); | ||||
|   }); | ||||
|   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)  | ||||
| { | ||||
|   int vlen = idx.size(); | ||||
|  | ||||
|   assert(vlen>=1); | ||||
|   assert(vlen<=sort_vals.size()); | ||||
|   assert(vlen<=_v.size()); | ||||
|  | ||||
|   for (size_t i=0;i<vlen;i++) { | ||||
|  | ||||
|     if (idx[i] != i) { | ||||
|  | ||||
|       ////////////////////////////////////// | ||||
|       // idx[i] is a table of desired sources giving a permutation. | ||||
|       // Swap v[i] with v[idx[i]]. | ||||
|       // Find  j>i for which _vnew[j] = _vold[i], | ||||
|       // track the move idx[j] => idx[i] | ||||
|       // track the move idx[i] => i | ||||
|       ////////////////////////////////////// | ||||
|       size_t j; | ||||
|       for (j=i;j<idx.size();j++) | ||||
| 	if (idx[j]==i) | ||||
| 	  break; | ||||
|  | ||||
|       assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i); | ||||
|  | ||||
|       swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy | ||||
|       std::swap(sort_vals[i],sort_vals[idx[i]]); | ||||
|  | ||||
|       idx[j] = idx[i]; | ||||
|       idx[i] = i; | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)  | ||||
| { | ||||
|   std::vector<int> idx(sort_vals.size()); | ||||
|   std::iota(idx.begin(), idx.end(), 0); | ||||
|  | ||||
|   // sort indexes based on comparing values in v | ||||
|   std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { | ||||
|     return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); | ||||
|   }); | ||||
|   return idx; | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)  | ||||
| { | ||||
|   std::vector<int> idx = basisSortGetIndex(sort_vals); | ||||
|   if (reverse) | ||||
|     std::reverse(idx.begin(), idx.end()); | ||||
|    | ||||
|   basisReorderInPlace(_v,sort_vals,idx); | ||||
| } | ||||
|  | ||||
| // PAB: faster to compute the inner products first then fuse loops. | ||||
| // If performance critical can improve. | ||||
| template<class Field> | ||||
| void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) { | ||||
|   result = Zero(); | ||||
|   assert(_v.size()==eval.size()); | ||||
|   int N = (int)_v.size(); | ||||
|   for (int i=0;i<N;i++) { | ||||
|     Field& tmp = _v[i]; | ||||
|     axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result); | ||||
|   } | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,146 +24,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_COMPARISON_H | ||||
| #define GRID_LATTICE_COMPARISON_H | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     ////////////////////////////////////////////////////////////////////////// | ||||
|     // relational operators | ||||
|     //  | ||||
|     // Support <,>,<=,>=,==,!= | ||||
|     // | ||||
|     //Query supporting bitwise &, |, ^, ! | ||||
|     //Query supporting logical &&, ||,  | ||||
|     ////////////////////////////////////////////////////////////////////////// | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // relational operators | ||||
| //  | ||||
| // Support <,>,<=,>=,==,!= | ||||
| // | ||||
| //Query supporting bitwise &, |, ^, ! | ||||
| //Query supporting logical &&, ||,  | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // compare lattice to lattice | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vfunctor,class lobj,class robj>   | ||||
|     inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) | ||||
|   { | ||||
|     Lattice<vInteger> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
| typedef iScalar<vInteger> vPredicate ; | ||||
|  | ||||
| /* | ||||
| template <class iobj, class vobj, class robj> accelerator_inline  | ||||
| vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)  | ||||
| { | ||||
|   typename std::remove_const<vobj>::type ret; | ||||
|  | ||||
|   typedef typename vobj::scalar_object scalar_object; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   const int Nsimd = vobj::vector_type::Nsimd(); | ||||
|  | ||||
|   ExtractBuffer<Integer> mask(Nsimd); | ||||
|   ExtractBuffer<scalar_object> truevals(Nsimd); | ||||
|   ExtractBuffer<scalar_object> falsevals(Nsimd); | ||||
|  | ||||
|   extract(iftrue, truevals); | ||||
|   extract(iffalse, falsevals); | ||||
|   extract<vInteger, Integer>(TensorRemove(predicate), mask); | ||||
|  | ||||
|   for (int s = 0; s < Nsimd; s++) { | ||||
|     if (mask[s]) falsevals[s] = truevals[s]; | ||||
|   } | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // compare lattice to scalar | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vfunctor,class lobj,class robj>  | ||||
|     inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) | ||||
|   { | ||||
|     Lattice<vInteger> ret(lhs._grid); | ||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // compare scalar to lattice | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vfunctor,class lobj,class robj>  | ||||
|     inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) | ||||
|   { | ||||
|     Lattice<vInteger> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|    | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // Map to functors | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // Less than | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|     return SLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|    | ||||
|   // Less than equal | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|     return SLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|    | ||||
|   // Greater than  | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|     return LLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|     return LSComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|   template<class lobj,class robj> | ||||
|     inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
|   } | ||||
|    | ||||
|    | ||||
|   // Greater than equal | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|    inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|     | ||||
|    // equal | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|     | ||||
|     | ||||
|    // not equal | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|      return LLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|      return LSComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|    template<class lobj,class robj> | ||||
|      inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|      return SLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
|    } | ||||
|  | ||||
|   merge(ret, falsevals); | ||||
|   return ret; | ||||
| } | ||||
| */ | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // compare lattice to lattice | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vfunctor,class lobj,class robj>   | ||||
| inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) | ||||
| { | ||||
|   Lattice<vPredicate> ret(rhs.Grid()); | ||||
|   autoView( lhs_v, lhs, CpuRead); | ||||
|   autoView( rhs_v, rhs, CpuRead); | ||||
|   autoView( ret_v, ret, CpuWrite); | ||||
|   thread_for( ss, rhs_v.size(), { | ||||
|       ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // compare lattice to scalar | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template<class vfunctor,class lobj,class robj>  | ||||
| inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) | ||||
| { | ||||
|   Lattice<vPredicate> ret(lhs.Grid()); | ||||
|   autoView( lhs_v, lhs, CpuRead); | ||||
|   autoView( ret_v, ret, CpuWrite); | ||||
|   thread_for( ss, lhs_v.size(), { | ||||
|     ret_v[ss]=op(lhs_v[ss],rhs); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // compare scalar to lattice | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| template<class vfunctor,class lobj,class robj>  | ||||
| inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) | ||||
| { | ||||
|   Lattice<vPredicate> ret(rhs.Grid()); | ||||
|   autoView( rhs_v, rhs, CpuRead); | ||||
|   autoView( ret_v, ret, CpuWrite); | ||||
|   thread_for( ss, rhs_v.size(), { | ||||
|     ret_v[ss]=op(lhs,rhs_v[ss]); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
|    | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Map to functors | ||||
| ////////////////////////////////////////////////////////////////////////// | ||||
| // Less than | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(vlt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
|    | ||||
| // Less than equal | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(vle<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(vle<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
|    | ||||
| // Greater than  | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(vgt<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
|    | ||||
|    | ||||
| // Greater than equal | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(vge<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(vge<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
|     | ||||
| // equal | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(veq<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(veq<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
|     | ||||
|     | ||||
| // not equal | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { | ||||
|   return LLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) { | ||||
|   return LSComparison(vne<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| template<class lobj,class robj> | ||||
| inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) { | ||||
|   return SLComparison(vne<lobj,robj>(),lhs,rhs); | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_COMPARISON_H | ||||
| #define GRID_COMPARISON_H | ||||
|  | ||||
| namespace Grid { | ||||
| #pragma once | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   ///////////////////////////////////////// | ||||
|   // This implementation is a bit poor. | ||||
| @@ -44,42 +44,42 @@ namespace Grid { | ||||
|   // | ||||
|   template<class lobj,class robj> class veq { | ||||
|   public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) == (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class vne { | ||||
|   public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) != (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class vlt { | ||||
|   public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) < (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class vle { | ||||
|   public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) <= (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class vgt { | ||||
|   public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) > (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class vge { | ||||
|     public: | ||||
|     vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator vInteger operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) >= (rhs); | ||||
|     } | ||||
| @@ -88,42 +88,42 @@ namespace Grid { | ||||
|   // Generic list of functors | ||||
|   template<class lobj,class robj> class seq { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) == (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class sne { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) != (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class slt { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) < (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class sle { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) <= (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class sgt { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) > (rhs); | ||||
|     } | ||||
|   }; | ||||
|   template<class lobj,class robj> class sge { | ||||
|   public: | ||||
|     Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     accelerator Integer operator()(const lobj &lhs, const robj &rhs) | ||||
|     {  | ||||
|       return (lhs) >= (rhs); | ||||
|     } | ||||
| @@ -133,12 +133,12 @@ namespace Grid { | ||||
|   // Integer and real get extra relational functions. | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>  | ||||
|     inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) | ||||
|     accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) | ||||
|     { | ||||
|       typedef typename vsimd::scalar_type scalar; | ||||
|       std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       std::vector<scalar> vrhs(vsimd::Nsimd()); | ||||
|       std::vector<Integer> vpred(vsimd::Nsimd()); | ||||
|       ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); | ||||
|       ExtractBuffer<Integer> vpred(vsimd::Nsimd()); | ||||
|       vInteger ret; | ||||
|       extract<vsimd,scalar>(lhs,vlhs); | ||||
|       extract<vsimd,scalar>(rhs,vrhs); | ||||
| @@ -150,11 +150,11 @@ namespace Grid { | ||||
|     } | ||||
|  | ||||
|   template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>  | ||||
|     inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) | ||||
|     accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) | ||||
|     { | ||||
|       typedef typename vsimd::scalar_type scalar; | ||||
|       std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       std::vector<Integer> vpred(vsimd::Nsimd()); | ||||
|       ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       ExtractBuffer<Integer> vpred(vsimd::Nsimd()); | ||||
|       vInteger ret; | ||||
|       extract<vsimd,scalar>(lhs,vlhs); | ||||
|       for(int s=0;s<vsimd::Nsimd();s++){ | ||||
| @@ -165,11 +165,11 @@ namespace Grid { | ||||
|     } | ||||
|  | ||||
|   template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>  | ||||
|     inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) | ||||
|     accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) | ||||
|     { | ||||
|       typedef typename vsimd::scalar_type scalar; | ||||
|       std::vector<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       std::vector<Integer> vpred(vsimd::Nsimd()); | ||||
|       ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation | ||||
|       ExtractBuffer<Integer> vpred(vsimd::Nsimd()); | ||||
|       vInteger ret; | ||||
|       extract<vsimd,scalar>(rhs,vrhs); | ||||
|       for(int s=0;s<vsimd::Nsimd();s++){ | ||||
| @@ -181,30 +181,30 @@ namespace Grid { | ||||
|  | ||||
| #define DECLARE_RELATIONAL_EQ(op,functor) \ | ||||
|   template<class vsimd,IfSimd<vsimd> = 0>\ | ||||
|     inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ | ||||
|     accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ | ||||
|     {\ | ||||
|       typedef typename vsimd::scalar_type scalar;\ | ||||
|       return Comparison(functor<scalar,scalar>(),lhs,rhs);\ | ||||
|     }\ | ||||
|   template<class vsimd,IfSimd<vsimd> = 0>\ | ||||
|     inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ | ||||
|     accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ | ||||
|     {\ | ||||
|       typedef typename vsimd::scalar_type scalar;\ | ||||
|       return Comparison(functor<scalar,scalar>(),lhs,rhs);\ | ||||
|     }\ | ||||
|   template<class vsimd,IfSimd<vsimd> = 0>\ | ||||
|     inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ | ||||
|     accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ | ||||
|     {\ | ||||
|       typedef typename vsimd::scalar_type scalar;\ | ||||
|       return Comparison(functor<scalar,scalar>(),lhs,rhs);\ | ||||
|     }\ | ||||
|   template<class vsimd>\ | ||||
|     inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ | ||||
|     accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ | ||||
|     {									\ | ||||
|       return lhs._internal op rhs;					\ | ||||
|     }									\ | ||||
|   template<class vsimd>\ | ||||
|     inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ | ||||
|     accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ | ||||
|     {									\ | ||||
|       return lhs op rhs._internal;					\ | ||||
|     }									\ | ||||
| @@ -212,7 +212,7 @@ namespace Grid { | ||||
| #define DECLARE_RELATIONAL(op,functor) \ | ||||
|   DECLARE_RELATIONAL_EQ(op,functor)    \ | ||||
|   template<class vsimd>\ | ||||
|     inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ | ||||
|     accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ | ||||
|     {									\ | ||||
|       return lhs._internal op rhs._internal;				\ | ||||
|     }									 | ||||
| @@ -226,7 +226,7 @@ DECLARE_RELATIONAL(!=,sne); | ||||
|  | ||||
| #undef DECLARE_RELATIONAL | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,18 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_CONFORMABLE_H | ||||
| #define GRID_LATTICE_CONFORMABLE_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|     template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) | ||||
|     { | ||||
|         assert(lhs._grid == rhs._grid); | ||||
|         assert(lhs.checkerboard == rhs.checkerboard); | ||||
|     } | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) | ||||
| { | ||||
|   assert(lhs.Grid() == rhs.Grid()); | ||||
|   assert(lhs.Checkerboard() == rhs.Checkerboard()); | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,34 +23,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_COORDINATE_H | ||||
| #define GRID_LATTICE_COORDINATE_H | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once  | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu) | ||||
|     { | ||||
|       typedef typename iobj::scalar_type scalar_type; | ||||
|       typedef typename iobj::vector_type vector_type; | ||||
| template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu) | ||||
| { | ||||
|   typedef typename iobj::scalar_type scalar_type; | ||||
|   typedef typename iobj::vector_type vector_type; | ||||
|  | ||||
|       GridBase *grid = l._grid; | ||||
|       int Nsimd = grid->iSites(); | ||||
|   GridBase *grid = l.Grid(); | ||||
|   int Nsimd = grid->iSites(); | ||||
|  | ||||
|       std::vector<int> gcoor; | ||||
|       std::vector<scalar_type> mergebuf(Nsimd); | ||||
|   autoView(l_v, l, CpuWrite); | ||||
|   thread_for( o, grid->oSites(), { | ||||
|     vector_type vI; | ||||
|     Coordinate gcoor; | ||||
|     ExtractBuffer<scalar_type> mergebuf(Nsimd); | ||||
|     for(int i=0;i<grid->iSites();i++){ | ||||
|       grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); | ||||
|       mergebuf[i]=(Integer)gcoor[mu]; | ||||
|     } | ||||
|     merge<vector_type,scalar_type>(vI,mergebuf); | ||||
|     l_v[o]=vI; | ||||
|   }); | ||||
| }; | ||||
|  | ||||
|       vector_type vI; | ||||
|       for(int o=0;o<grid->oSites();o++){ | ||||
| 	for(int i=0;i<grid->iSites();i++){ | ||||
| 	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); | ||||
| 	  mergebuf[i]=(Integer)gcoor[mu]; | ||||
| 	} | ||||
| 	merge<vector_type,scalar_type>(vI,mergebuf); | ||||
| 	l._odata[o]=vI; | ||||
|       } | ||||
|     }; | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_LOCALREDUCTION_H | ||||
| #define GRID_LATTICE_LOCALREDUCTION_H | ||||
|  | ||||
| @@ -32,44 +32,56 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| // localInner, localNorm, outerProduct | ||||
| /////////////////////////////////////////////// | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   ///////////////////////////////////////////////////// | ||||
|   // Non site, reduced locally reduced routines | ||||
|   ///////////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////////// | ||||
| // Non site, reduced locally reduced routines | ||||
| ///////////////////////////////////////////////////// | ||||
|    | ||||
|   // localNorm2, | ||||
|   template<class vobj> | ||||
|     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|    | ||||
|   // localInnerProduct | ||||
|   template<class vobj> | ||||
|     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|    | ||||
|   // outerProduct Scalar x Scalar -> Scalar | ||||
|   //              Vector x Vector -> Matrix | ||||
|   template<class ll,class rr> | ||||
|     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> | ||||
|   { | ||||
|     Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
| // localNorm2, | ||||
| template<class vobj> | ||||
| inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> | ||||
| { | ||||
|   Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ | ||||
|     coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
|    | ||||
| // localInnerProduct | ||||
| template<class vobj> | ||||
| inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> | ||||
| { | ||||
|   Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ | ||||
|     coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
|    | ||||
| // outerProduct Scalar x Scalar -> Scalar | ||||
| //              Vector x Vector -> Matrix | ||||
| template<class ll,class rr> | ||||
| inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))> | ||||
| { | ||||
|   typedef decltype(coalescedRead(ll())) sll; | ||||
|   typedef decltype(coalescedRead(rr())) srr; | ||||
|   Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   accelerator_for(ss,rhs_v.size(),1,{ | ||||
|     // FIXME had issues with scalar version of outer  | ||||
|     // Use vector [] operator and don't read coalesce this loop | ||||
|     ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										202
									
								
								Grid/lattice/Lattice_matrix_reduction.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										202
									
								
								Grid/lattice/Lattice_matrix_reduction.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,202 @@ | ||||
| /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|     Source file: ./Grid/lattice/Lattice_matrix_reduction.h | ||||
|     Copyright (C) 2015 | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once  | ||||
| #include <Grid/Grid_Eigen_Dense.h> | ||||
|  | ||||
| #ifdef GRID_WARN_SUBOPTIMAL | ||||
| #warning "Optimisation alert all these reduction loops are NOT threaded " | ||||
| #endif      | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /* sliceMaddMatrix: slice-wise multiply-add along direction Orthog. | ||||
|  * For every site offset o in the plane orthogonal to Orthog and every slice | ||||
|  * index i in [0,Nblock) it stores | ||||
|  *   R[o+i*ostride] = Y[o+i*ostride] + sum_j X[o+j*ostride] * (scale*aa(j,i)) | ||||
|  * R      : output lattice, accessed through the view opened with CpuWrite. | ||||
|  * aa     : coefficient matrix, only read here, as aa(j,i) with j,i in [0,Nblock). | ||||
|  * X, Y   : input lattices, accessed through views opened with CpuRead. | ||||
|  * Orthog : direction whose slices are mixed; its SIMD layout must be 1 (asserted). | ||||
|  * scale  : real factor multiplied into every aa entry (default 1.0). | ||||
|  * NOTE(review): Nblock is the global extent along Orthog yet it indexes the local | ||||
|  * views, so this presumably assumes Orthog is not split across ranks - confirm. | ||||
|  */ template<class vobj> | ||||
| static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; /* sobj, scalar_type and vector_type are not referenced below */ | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X.Grid()->GlobalDimensions()[Orthog]; /* number of slices along Orthog */ | ||||
|  | ||||
|   GridBase *FullGrid  = X.Grid(); | ||||
|   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|  | ||||
|   //  Lattice<vobj> Xslice(SliceGrid); | ||||
|   //  Lattice<vobj> Rslice(SliceGrid); | ||||
|  | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
|   int stride=FullGrid->_slice_stride[Orthog]; | ||||
|   int block =FullGrid->_slice_block [Orthog]; | ||||
|   int nblock=FullGrid->_slice_nblock[Orthog]; | ||||
|   int ostride=FullGrid->_ostride[Orthog]; /* index step between consecutive slices along Orthog */ | ||||
|   autoView( X_v , X, CpuRead); | ||||
|   autoView( Y_v , Y, CpuRead); | ||||
|   autoView( R_v , R, CpuWrite); | ||||
|   thread_region | ||||
|   { | ||||
|     std::vector<vobj> s_x(Nblock); /* scratch for the Nblock X values of one site; declared inside thread_region, presumably one copy per thread - confirm */ | ||||
|  | ||||
|     thread_loop_collapse2( (int n=0;n<nblock;n++),{ | ||||
|       for(int b=0;b<block;b++){ | ||||
| 	int o  = n*stride + b; /* offset of this site within the orthogonal plane */ | ||||
|  | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  s_x[i] = X_v[o+i*ostride]; /* gather X along Orthog before any R entry is written */ | ||||
| 	} | ||||
|  | ||||
| 	vobj dot; | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  dot = Y_v[o+i*ostride]; /* accumulator starts from the Y value of slice i */ | ||||
| 	  for(int j=0;j<Nblock;j++){ | ||||
| 	    dot = dot + s_x[j]*(scale*aa(j,i)); | ||||
| 	  } | ||||
| 	  R_v[o+i*ostride]=dot; | ||||
| 	} | ||||
|       }}); | ||||
|   } | ||||
| }; /* the ';' after the function body is a redundant empty declaration */ | ||||
|  | ||||
| /* sliceMulMatrix: slice-wise matrix multiply along direction Orthog. | ||||
|  * For every site offset o in the plane orthogonal to Orthog and every slice | ||||
|  * index i in [0,Nblock) it stores | ||||
|  *   R[o+i*ostride] = sum_j X[o+j*ostride] * (scale*aa(j,i)) | ||||
|  * R      : output lattice, accessed through the view opened with CpuWrite. | ||||
|  * aa     : coefficient matrix, only read here, as aa(j,i) with j,i in [0,Nblock). | ||||
|  * X      : input lattice, accessed through the view opened with CpuRead. | ||||
|  * Orthog : direction whose slices are mixed; its SIMD layout must be 1 (asserted). | ||||
|  * scale  : real factor multiplied into every aa entry (default 1.0). | ||||
|  * NOTE(review): Nblock is the global extent along Orthog yet it indexes the local | ||||
|  * views, so this presumably assumes Orthog is not split across ranks - confirm. | ||||
|  */ template<class vobj> | ||||
| static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; /* sobj, scalar_type and vector_type are not referenced below */ | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X.Grid()->GlobalDimensions()[Orthog]; /* number of slices along Orthog */ | ||||
|  | ||||
|   GridBase *FullGrid  = X.Grid(); | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
|   int stride=FullGrid->_slice_stride[Orthog]; | ||||
|   int block =FullGrid->_slice_block [Orthog]; | ||||
|   int nblock=FullGrid->_slice_nblock[Orthog]; | ||||
|   int ostride=FullGrid->_ostride[Orthog]; /* index step between consecutive slices along Orthog */ | ||||
|  | ||||
|   autoView( X_v , X, CpuRead); | ||||
|   autoView( R_v , R, CpuWrite); | ||||
|  | ||||
|   thread_region | ||||
|   { | ||||
|     std::vector<vobj> s_x(Nblock); /* scratch for the Nblock X values of one site; declared inside thread_region, presumably one copy per thread - confirm */ | ||||
|      | ||||
|     thread_loop_collapse2( (int n=0;n<nblock;n++),{ | ||||
|       for(int b=0;b<block;b++){ | ||||
| 	int o  = n*stride + b; /* offset of this site within the orthogonal plane */ | ||||
|  | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  s_x[i] = X_v[o+i*ostride]; /* all X values are buffered before R is written; presumably this allows R to alias X - confirm */ | ||||
| 	} | ||||
|  | ||||
| 	vobj dot; | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  dot = s_x[0]*(scale*aa(0,i)); /* the j=0 term initialises the accumulator, so the inner loop starts at j=1 */ | ||||
| 	  for(int j=1;j<Nblock;j++){ | ||||
| 	    dot = dot + s_x[j]*(scale*aa(j,i)); | ||||
| 	  } | ||||
| 	  R_v[o+i*ostride]=dot; | ||||
| 	} | ||||
|     }}); | ||||
|   } | ||||
|  | ||||
| }; /* the ';' after the function body is a redundant empty declaration */ | ||||
|  | ||||
|  | ||||
| /* sliceInnerProductMatrix: matrix of inner products between the Orthog slices of lhs and rhs. | ||||
|  * mat is reset to an Nblock x Nblock zero matrix, then for every site offset o in the | ||||
|  * plane orthogonal to Orthog the value Reduce(TensorRemove(innerProduct(lhs[o+i*ostride], | ||||
|  * rhs[o+j*ostride]))) is accumulated into entry (i,j); finally each entry is passed | ||||
|  * through FullGrid->GlobalSum. | ||||
|  * mat    : output matrix, overwritten. | ||||
|  * lhs    : left operand; its grid supplies all geometry used here. | ||||
|  * rhs    : right operand, indexed with the same offsets as lhs (no conformable check is made here). | ||||
|  * Orthog : direction whose slices are paired; its SIMD layout must be 1 (asserted). | ||||
|  * NOTE(review): Nblock is the global extent along Orthog yet it indexes the local | ||||
|  * views, so this presumably assumes Orthog is not split across ranks - confirm. | ||||
|  */ template<class vobj> | ||||
| static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; /* sobj, scalar_type and vector_type are not referenced below */ | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   GridBase *FullGrid  = lhs.Grid(); | ||||
|   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; /* number of slices along Orthog */ | ||||
|    | ||||
|   //  Lattice<vobj> Lslice(SliceGrid); | ||||
|   //  Lattice<vobj> Rslice(SliceGrid); | ||||
|    | ||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|  | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|   //  int nh =  FullGrid->_ndimension; | ||||
|   //  int nl = SliceGrid->_ndimension; | ||||
|   //  int nl = nh-1; | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
|   int stride=FullGrid->_slice_stride[Orthog]; | ||||
|   int block =FullGrid->_slice_block [Orthog]; | ||||
|   int nblock=FullGrid->_slice_nblock[Orthog]; | ||||
|   int ostride=FullGrid->_ostride[Orthog]; /* index step between consecutive slices along Orthog */ | ||||
|  | ||||
|   typedef typename vobj::vector_typeD vector_typeD; /* not referenced below */ | ||||
|   autoView( lhs_v , lhs, CpuRead); | ||||
|   autoView( rhs_v , rhs, CpuRead); | ||||
|   thread_region { | ||||
|     std::vector<vobj> Left(Nblock); | ||||
|     std::vector<vobj> Right(Nblock); | ||||
|     Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); /* partial sums local to this region; merged into mat below */ | ||||
|  | ||||
|     thread_loop_collapse2((int n=0;n<nblock;n++),{ | ||||
|       for(int b=0;b<block;b++){ | ||||
|  | ||||
| 	int o  = n*stride + b; /* offset of this site within the orthogonal plane */ | ||||
|  | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  Left [i] = lhs_v[o+i*ostride]; | ||||
| 	  Right[i] = rhs_v[o+i*ostride]; | ||||
| 	} | ||||
|  | ||||
| 	for(int i=0;i<Nblock;i++){ | ||||
| 	  for(int j=0;j<Nblock;j++){ | ||||
| 	    auto tmp = innerProduct(Left[i],Right[j]); | ||||
| 	    auto rtmp = TensorRemove(tmp); | ||||
| 	    ComplexD z = Reduce(rtmp); | ||||
| 	    mat_thread(i,j) += std::complex<double>(real(z),imag(z)); /* rebuilt from real/imag parts to match Eigen's complex<double> entries */ | ||||
| 	  }} | ||||
|     }}); | ||||
|     thread_critical { | ||||
|       mat += mat_thread; /* merge of the partial matrix into the shared result, inside thread_critical */ | ||||
|     }   | ||||
|   } | ||||
|  | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       ComplexD sum = mat(i,j); | ||||
|       FullGrid->GlobalSum(sum); /* one GlobalSum call per matrix entry: Nblock*Nblock calls in total */ | ||||
|       mat(i,j)=sum; | ||||
|     }} | ||||
|  | ||||
|   return; | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -1,138 +0,0 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/lattice/Lattice_overload.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_OVERLOAD_H | ||||
| #define GRID_LATTICE_OVERLOAD_H | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // unary negation | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class vobj> | ||||
|   inline Lattice<vobj> operator -(const Lattice<vobj> &r) | ||||
|   { | ||||
|     Lattice<vobj> ret(r._grid); | ||||
|     parallel_for(int ss=0;ss<r._grid->oSites();ss++){ | ||||
|       vstream(ret._odata[ss], -r._odata[ss]); | ||||
|     } | ||||
|     return ret; | ||||
|   }  | ||||
|   ///////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Lattice BinOp Lattice, | ||||
|   //NB mult performs conformable check. Do not reapply here for performance. | ||||
|   ///////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class left,class right> | ||||
|     inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid); | ||||
|     mult(ret,lhs,rhs); | ||||
|     return ret; | ||||
|   } | ||||
|   template<class left,class right> | ||||
|     inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid); | ||||
|     add(ret,lhs,rhs); | ||||
|     return ret; | ||||
|   } | ||||
|   template<class left,class right> | ||||
|     inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid); | ||||
|     sub(ret,lhs,rhs); | ||||
|     return ret; | ||||
|   } | ||||
|    | ||||
|   // Scalar BinOp Lattice ;generate return type | ||||
|   template<class left,class right> | ||||
|   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];  | ||||
|       vstream(ret._odata[ss],tmp); | ||||
| 	   //      ret._odata[ss]=lhs*rhs._odata[ss]; | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|   template<class left,class right> | ||||
|     inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])> | ||||
|     { | ||||
|       Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid); | ||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];   | ||||
| 	vstream(ret._odata[ss],tmp); | ||||
| 	//	ret._odata[ss]=lhs+rhs._odata[ss]; | ||||
|       } | ||||
|         return ret; | ||||
|     } | ||||
|   template<class left,class right> | ||||
|     inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])> | ||||
|   { | ||||
|     Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid); | ||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|       decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];   | ||||
|       vstream(ret._odata[ss],tmp); | ||||
|     } | ||||
|     return ret; | ||||
|   } | ||||
|     template<class left,class right> | ||||
|       inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)> | ||||
|     { | ||||
|       Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid); | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs; | ||||
| 	vstream(ret._odata[ss],tmp); | ||||
| 	//            ret._odata[ss]=lhs._odata[ss]*rhs; | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|     // Lattice + scalar: site ss of the result holds lhs._odata[ss] + rhs; the result is built on lhs's grid. | ||||
|     template<class left,class right> | ||||
|       inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)> | ||||
|     { | ||||
|         Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid); | ||||
| 	// The site count comes from lhs: rhs is the non-lattice operand and carries no grid. | ||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; | ||||
| 	  vstream(ret._odata[ss],tmp); | ||||
|         } | ||||
|         return ret; | ||||
|     } | ||||
|     // Lattice - scalar: site ss of the result holds lhs._odata[ss] - rhs; the result is built on lhs's grid. | ||||
|     template<class left,class right> | ||||
|       inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)> | ||||
|     { | ||||
|       Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid); | ||||
|       // The site count comes from lhs: rhs is the non-lattice operand and carries no grid. | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||
| 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs; | ||||
| 	  vstream(ret._odata[ss],tmp); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
| } | ||||
| #endif | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -25,8 +25,8 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_PEEK_H | ||||
| #define GRID_LATTICE_PEEK_H | ||||
|  | ||||
| @@ -34,172 +34,182 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
| // Peeking and poking around | ||||
| /////////////////////////////////////////////// | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Peek internal indices of a Lattice object | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // One-index peek: every outer site of the result holds peekIndex<Index>(lhs site, i). | ||||
|     // The result shares lhs's grid and checkerboard. | ||||
|     template<int Index,class vobj> | ||||
|        auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> | ||||
|     { | ||||
|       typedef decltype(peekIndex<Index>(lhs._odata[0],i)) peeked_t; | ||||
|       Lattice<peeked_t> result(lhs._grid); | ||||
|       result.checkerboard=lhs.checkerboard; | ||||
|       parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|         result._odata[site] = peekIndex<Index>(lhs._odata[site],i); | ||||
|       } | ||||
|       return result; | ||||
|     }; | ||||
|     // Two-index peek: every outer site of the result holds peekIndex<Index>(lhs site, i, j). | ||||
|     // The result shares lhs's grid and checkerboard. | ||||
|     template<int Index,class vobj> | ||||
|       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> | ||||
|     { | ||||
|       typedef decltype(peekIndex<Index>(lhs._odata[0],i,j)) peeked_t; | ||||
|       Lattice<peeked_t> result(lhs._grid); | ||||
|       result.checkerboard=lhs.checkerboard; | ||||
|       parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|         result._odata[site] = peekIndex<Index>(lhs._odata[site],i,j); | ||||
|       } | ||||
|       return result; | ||||
|     }; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Poke internal indices of a Lattice object | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // One-index poke: for every outer site of lhs, calls pokeIndex<Index> with the matching site of rhs and index i. | ||||
|     template<int Index,class vobj>  | ||||
|     void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i) | ||||
|     { | ||||
|       parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|         pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i); | ||||
|       } | ||||
|     } | ||||
|     // Two-index poke: for every outer site of lhs, calls pokeIndex<Index> with the matching site of rhs and indices (i,j). | ||||
|     template<int Index,class vobj> | ||||
|       void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j) | ||||
|     { | ||||
|       parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|         pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i,j); | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     ////////////////////////////////////////////////////// | ||||
|     // Poke a scalar object into the SIMD array | ||||
|     ////////////////////////////////////////////////////// | ||||
|     template<class vobj,class sobj> | ||||
|     void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){ | ||||
|  | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|       GridBase *grid=l._grid; | ||||
|       int lanes = grid->Nsimd(); | ||||
|  | ||||
|       // The site has to sit on this lattice's checkerboard, and one vobj must be | ||||
|       // exactly `lanes` scalar objects wide. | ||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||
|       assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|       // Translate the global coordinate into (owning rank, outer index, inner lane). | ||||
|       int owner,outer,lane; | ||||
|       grid->GlobalCoorToRankIndex(owner,outer,lane,site); | ||||
|       // Broadcast s from the boss rank (the original notes this step is optional). | ||||
|       grid->Broadcast(grid->BossRank(),s); | ||||
|  | ||||
|       std::vector<sobj> scalars(lanes); | ||||
|  | ||||
|       // Only the owning rank modifies data: unpack the outer site, overwrite one lane, repack. | ||||
|       if ( owner == grid->ThisRank() ) { | ||||
|         extract(l._odata[outer],scalars); | ||||
|         scalars[lane] = s; | ||||
|         merge(l._odata[outer],scalars); | ||||
|       } | ||||
|  | ||||
|       return; | ||||
|     }; | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|  | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     // Peek a scalar object from the SIMD array | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     template<class vobj,class sobj> | ||||
|       void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){ | ||||
|  | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|       GridBase *grid=l._grid; | ||||
|       int lanes = grid->Nsimd(); | ||||
|  | ||||
|       // The requested site has to sit on this lattice's checkerboard. | ||||
|       assert( l.checkerboard == l._grid->CheckerBoard(site)); | ||||
|  | ||||
|       // Translate the global coordinate into (owning rank, outer index, inner lane). | ||||
|       int owner,outer,lane; | ||||
|       grid->GlobalCoorToRankIndex(owner,outer,lane,site); | ||||
|  | ||||
|       // Every rank unpacks its local outer site and picks the lane ... | ||||
|       std::vector<sobj> scalars(lanes); | ||||
|       extract(l._odata[outer],scalars); | ||||
|       s = scalars[lane]; | ||||
|  | ||||
|       // ... and the broadcast from the owning rank then sets s everywhere. | ||||
|       grid->Broadcast(owner,s); | ||||
|  | ||||
|       return; | ||||
|     }; | ||||
|  | ||||
|  | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     // Peek a scalar object from the SIMD array | ||||
|     ////////////////////////////////////////////////////////// | ||||
|     template<class vobj,class sobj> | ||||
|     void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){ | ||||
|  | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|       GridBase *grid = l._grid; | ||||
|       int lanes = grid->Nsimd(); | ||||
|  | ||||
|       // The site has to sit on this lattice's checkerboard, and one vobj must be | ||||
|       // exactly `lanes` scalar objects wide. | ||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||
|       assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|       // Number of vector_type words that make up one vobj. | ||||
|       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|  | ||||
|       // Inner (lane) and outer indices of the site, as reported by the grid. | ||||
|       int lane  = grid->iIndex(site); | ||||
|       int outer = grid->oIndex(site); | ||||
|  | ||||
|       // View both objects as flat arrays of scalar_type. | ||||
|       scalar_type * src = (scalar_type *)&l._odata[outer]; | ||||
|       scalar_type * dst = (scalar_type *)&s; | ||||
|  | ||||
|       // Word w of the scalar object is read from offset lane + w*lanes of the vector object. | ||||
|       int offset = lane; | ||||
|       for(int w=0;w<words;w++){ | ||||
|         dst[w] = src[offset]; | ||||
|         offset += lanes; | ||||
|       } | ||||
|  | ||||
|       return; | ||||
|     }; | ||||
|  | ||||
|     // Writes the scalar object s into one lane of the outer site of l selected by `site` (no communication calls). | ||||
|     template<class vobj,class sobj> | ||||
|     void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){ | ||||
|  | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|       GridBase *grid=l._grid; | ||||
|       int lanes = grid->Nsimd(); | ||||
|  | ||||
|       // The site has to sit on this lattice's checkerboard, and one vobj must be | ||||
|       // exactly `lanes` scalar objects wide. | ||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||
|       assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|       // Number of vector_type words that make up one vobj. | ||||
|       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|  | ||||
|       // Inner (lane) and outer indices of the site, as reported by the grid. | ||||
|       int lane  = grid->iIndex(site); | ||||
|       int outer = grid->oIndex(site); | ||||
|  | ||||
|       // View both objects as flat arrays of scalar_type. | ||||
|       scalar_type * dst = (scalar_type *)&l._odata[outer]; | ||||
|       scalar_type * src = (scalar_type *)&s; | ||||
|  | ||||
|       // Word w of the scalar object is written to offset lane + w*lanes of the vector object. | ||||
|       int offset = lane; | ||||
|       for(int w=0;w<words;w++){ | ||||
|         dst[offset] = src[w]; | ||||
|         offset += lanes; | ||||
|       } | ||||
|  | ||||
|       return; | ||||
|     }; | ||||
| // FIXME accelerator_loop and accelerator_inline these | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Peek internal indices of a Lattice object | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // One-index peek: element ss of the result holds peekIndex<Index>(lhs element ss, i). | ||||
| // The result is built on lhs's grid and given lhs's checkerboard. | ||||
| template<int Index,class vobj>  | ||||
| auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))> | ||||
| { | ||||
|   typedef decltype(peekIndex<Index>(vobj(),i)) peeked_t; | ||||
|   Lattice<peeked_t> result(lhs.Grid()); | ||||
|   result.Checkerboard()=lhs.Checkerboard(); | ||||
|   // write view on the output, read view on the input | ||||
|   autoView( out_v, result, AcceleratorWrite); | ||||
|   autoView( in_v, lhs, AcceleratorRead); | ||||
|   accelerator_for( site, in_v.size(), 1, { | ||||
|     out_v[site] = peekIndex<Index>(in_v[site],i); | ||||
|   }); | ||||
|   return result; | ||||
| }; | ||||
| // Two-index peek: element ss of the result holds peekIndex<Index>(lhs element ss, i, j). | ||||
| // The result is built on lhs's grid and given lhs's checkerboard. | ||||
| template<int Index,class vobj>  | ||||
| auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))> | ||||
| { | ||||
|   typedef decltype(peekIndex<Index>(vobj(),i,j)) peeked_t; | ||||
|   Lattice<peeked_t> result(lhs.Grid()); | ||||
|   result.Checkerboard()=lhs.Checkerboard(); | ||||
|   // write view on the output, read view on the input | ||||
|   autoView( out_v, result, AcceleratorWrite); | ||||
|   autoView( in_v, lhs, AcceleratorRead); | ||||
|   accelerator_for( site, in_v.size(), 1, { | ||||
|     out_v[site] = peekIndex<Index>(in_v[site],i,j); | ||||
|   }); | ||||
|   return result; | ||||
| }; | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Poke internal indices of a Lattice object | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // One-index poke: for every element of lhs, calls pokeIndex<Index> with the matching element of rhs and index i. | ||||
| template<int Index,class vobj>   | ||||
| void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) | ||||
| { | ||||
|   // read view on the source, write view on the destination | ||||
|   autoView( src_v, rhs, AcceleratorRead); | ||||
|   autoView( dst_v, lhs, AcceleratorWrite); | ||||
|   accelerator_for( site, dst_v.size(), 1, { | ||||
|     pokeIndex<Index>(dst_v[site],src_v[site],i); | ||||
|   }); | ||||
| } | ||||
| // Two-index poke: for every element of lhs, calls pokeIndex<Index> with the matching element of rhs and indices (i,j). | ||||
| template<int Index,class vobj>  | ||||
| void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) | ||||
| { | ||||
|   // read view on the source, write view on the destination | ||||
|   autoView( src_v, rhs, AcceleratorRead); | ||||
|   autoView( dst_v, lhs, AcceleratorWrite); | ||||
|   accelerator_for( site, dst_v.size(), 1, { | ||||
|     pokeIndex<Index>(dst_v[site],src_v[site],i,j); | ||||
|   }); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // Poke a scalar object into the SIMD array | ||||
| ////////////////////////////////////////////////////// | ||||
| template<class vobj,class sobj>  | ||||
| void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){ | ||||
|  | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   GridBase *grid=l.Grid(); | ||||
|   int lanes = grid->Nsimd(); | ||||
|  | ||||
|   // The site has to sit on this lattice's checkerboard, and one vobj must be | ||||
|   // exactly `lanes` scalar objects wide. | ||||
|   assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); | ||||
|   assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|   // Translate the global coordinate into (owning rank, outer index, inner lane). | ||||
|   int owner,outer,lane; | ||||
|   grid->GlobalCoorToRankIndex(owner,outer,lane,site); | ||||
|   // Broadcast s from the boss rank (the original notes this step is optional). | ||||
|   grid->Broadcast(grid->BossRank(),s); | ||||
|  | ||||
|   ExtractBuffer<sobj> scalars(lanes); | ||||
|   autoView( dst_v , l, CpuWrite); | ||||
|   // Only the owning rank modifies data: unpack the outer site, overwrite one lane, repack. | ||||
|   if ( owner == grid->ThisRank() ) { | ||||
|     extract(dst_v[outer],scalars); | ||||
|     scalars[lane] = s; | ||||
|     merge(dst_v[outer],scalars); | ||||
|   } | ||||
|  | ||||
|   return; | ||||
| }; | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////// | ||||
| // Peek a scalar object from the SIMD array | ||||
| ////////////////////////////////////////////////////////// | ||||
| template<class vobj,class sobj> | ||||
| void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){ | ||||
|  | ||||
|   GridBase *grid=l.Grid(); | ||||
|  | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nsimd = grid->Nsimd(); | ||||
|  | ||||
|   // The requested site has to sit on this lattice's checkerboard. | ||||
|   assert( l.Checkerboard() == l.Grid()->CheckerBoard(site)); | ||||
|  | ||||
|   // Translate the global coordinate into (owning rank, outer index, inner lane). | ||||
|   int rank,odx,idx; | ||||
|   grid->GlobalCoorToRankIndex(rank,odx,idx,site); | ||||
|  | ||||
|   ExtractBuffer<sobj> buf(Nsimd); | ||||
|   // The lattice is const and only passed to extract here, so a read view is requested | ||||
|   // rather than a write view. | ||||
|   autoView( l_v , l, CpuRead); | ||||
|   extract(l_v[odx],buf); | ||||
|  | ||||
|   s = buf[idx]; | ||||
|  | ||||
|   // Every rank extracted locally; the broadcast from the owning rank sets s everywhere. | ||||
|   grid->Broadcast(rank,s); | ||||
|  | ||||
|   return; | ||||
| }; | ||||
|  | ||||
| ////////////////////////////////////////////////////////// | ||||
| // Peek a scalar object from the SIMD array | ||||
| ////////////////////////////////////////////////////////// | ||||
| // Must be CPU read view | ||||
| template<class vobj,class sobj> | ||||
| inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site) | ||||
| { | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   GridBase *grid = l.getGrid(); | ||||
|   // The view must have been opened in CpuRead mode. | ||||
|   assert(l.mode==CpuRead); | ||||
|  | ||||
|   int lanes = grid->Nsimd(); | ||||
|  | ||||
|   // The site has to sit on the view's checkerboard, and one vobj must be | ||||
|   // exactly `lanes` scalar objects wide. | ||||
|   assert( l.Checkerboard()== grid->CheckerBoard(site)); | ||||
|   assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|   // Number of vector_type words that make up one vobj. | ||||
|   static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|  | ||||
|   // Inner (lane) and outer indices of the site, as reported by the grid. | ||||
|   int lane  = grid->iIndex(site); | ||||
|   int outer = grid->oIndex(site); | ||||
|  | ||||
|   // View both objects as flat arrays of scalar_type. | ||||
|   scalar_type * src = (scalar_type *)&l[outer]; | ||||
|   scalar_type * dst = (scalar_type *)&s; | ||||
|  | ||||
|   // Word w of the scalar object is read from offset lane + w*lanes of the vector object. | ||||
|   int offset = lane; | ||||
|   for(int w=0;w<words;w++){ | ||||
|     dst[w] = src[offset]; | ||||
|     offset += lanes; | ||||
|   } | ||||
|  | ||||
|   return; | ||||
| }; | ||||
| // Must be CPU write view | ||||
| template<class vobj,class sobj> | ||||
| inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site) | ||||
| { | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   GridBase *grid=l.getGrid(); | ||||
|   // The view must have been opened in CpuWrite mode. | ||||
|   assert(l.mode==CpuWrite); | ||||
|  | ||||
|   int lanes = grid->Nsimd(); | ||||
|  | ||||
|   // The site has to sit on the view's checkerboard, and one vobj must be | ||||
|   // exactly `lanes` scalar objects wide. | ||||
|   assert( l.Checkerboard()== grid->CheckerBoard(site)); | ||||
|   assert( sizeof(sobj)*lanes == sizeof(vobj)); | ||||
|  | ||||
|   // Number of vector_type words that make up one vobj. | ||||
|   static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|  | ||||
|   // Inner (lane) and outer indices of the site, as reported by the grid. | ||||
|   int lane  = grid->iIndex(site); | ||||
|   int outer = grid->oIndex(site); | ||||
|  | ||||
|   // View both objects as flat arrays of scalar_type. | ||||
|   scalar_type * dst = (scalar_type *)&l[outer]; | ||||
|   scalar_type * src = (scalar_type *)&s; | ||||
|  | ||||
|   // Word w of the scalar object is written to offset lane + w*lanes of the vector object. | ||||
|   int offset = lane; | ||||
|   for(int w=0;w<words;w++){ | ||||
|     dst[offset] = src[w]; | ||||
|     offset += lanes; | ||||
|   } | ||||
|   return; | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -25,8 +25,8 @@ Author: neo <cossu@post.kek.jp> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_REALITY_H | ||||
| #define GRID_LATTICE_REALITY_H | ||||
|  | ||||
| @@ -36,22 +36,34 @@ Author: neo <cossu@post.kek.jp> | ||||
| // The choice of burying complex in the SIMD | ||||
| // is making the use of "real" and "imag" very cumbersome | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     // Site-wise adj: every outer site of the result holds adj() of the matching site of lhs. | ||||
|     template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ | ||||
|         Lattice<vobj> result(lhs._grid); | ||||
|         parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|             result._odata[site] = adj(lhs._odata[site]); | ||||
|         } | ||||
|         return result; | ||||
|     }; | ||||
| // Element-wise adj: the result (on lhs's grid, with lhs's checkerboard) holds adj() of each element of lhs. | ||||
| template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ | ||||
|   Lattice<vobj> result(lhs.Grid()); | ||||
|  | ||||
|   // read view on the input, write view on the output | ||||
|   autoView( in_v, lhs, AcceleratorRead); | ||||
|   autoView( out_v, result, AcceleratorWrite); | ||||
|  | ||||
|   result.Checkerboard()=lhs.Checkerboard(); | ||||
|   accelerator_for( site, in_v.size(), vobj::Nsimd(), { | ||||
|     coalescedWrite(out_v[site], adj(in_v(site))); | ||||
|   }); | ||||
|   return result; | ||||
| }; | ||||
|  | ||||
| // Element-wise conjugate: the result (on lhs's grid, with lhs's checkerboard) holds conjugate() of each element of lhs. | ||||
| template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ | ||||
|   Lattice<vobj> result(lhs.Grid()); | ||||
|  | ||||
|   // read view on the input, write view on the output | ||||
|   autoView( in_v, lhs, AcceleratorRead); | ||||
|   autoView( out_v, result, AcceleratorWrite); | ||||
|  | ||||
|   result.Checkerboard() = lhs.Checkerboard(); | ||||
|   accelerator_for( site, in_v.size(), vobj::Nsimd(), { | ||||
|     coalescedWrite( out_v[site] , conjugate(in_v(site))); | ||||
|   }); | ||||
|   return result; | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|     // Site-wise conjugate: every outer site of the result holds conjugate() of the matching site of lhs. | ||||
|     template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ | ||||
|         Lattice<vobj> result(lhs._grid); | ||||
|         parallel_for(int site=0;site<lhs._grid->oSites();site++){ | ||||
|           result._odata[site] = conjugate(lhs._odata[site]); | ||||
|         } | ||||
|         return result; | ||||
|     }; | ||||
| } | ||||
| #endif | ||||
|   | ||||
| @@ -5,6 +5,7 @@ | ||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Christoph Lehner <christoph@lhnr.de> | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
| @@ -19,58 +20,172 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_REDUCTION_H | ||||
| #define GRID_LATTICE_REDUCTION_H | ||||
| #pragma once | ||||
|  | ||||
| #include <Grid/Grid_Eigen_Dense.h> | ||||
|  | ||||
| namespace Grid { | ||||
| #ifdef GRID_WARN_SUBOPTIMAL | ||||
| #warning "Optimisation alert all these reduction loops are NOT threaded " | ||||
| #endif      | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Deterministic Reduction operations | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #include <Grid/lattice/Lattice_reduction_gpu.h> | ||||
| #endif | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // FIXME this should promote to double and accumulate | ||||
| ////////////////////////////////////////////////////// | ||||
| // Threaded sum of arg[0..osites): each thread adds up its own share of the sites, | ||||
| // reduces that partial with Reduce(), and the per-thread results are then added serially. | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) | ||||
| { | ||||
|   typedef typename vobj::scalar_object  sobj; | ||||
|  | ||||
|   const int nthread = GridThread::GetThreads(); | ||||
|  | ||||
|   // one partial result per thread, cleared up front | ||||
|   Vector<sobj> partials(nthread); | ||||
|   for(int t=0;t<nthread;t++){ | ||||
|     partials[t]=Zero(); | ||||
|   } | ||||
|  | ||||
|   thread_for(thr,nthread, { | ||||
|     // GetWork reports this thread's slice as a count (mywork) and a start offset (myoff) | ||||
|     int nwork = osites; | ||||
|     int mywork; | ||||
|     int myoff; | ||||
|     GridThread::GetWork(nwork,thr,mywork,myoff); | ||||
|     const int last = mywork+myoff; | ||||
|     vobj local=Zero(); | ||||
|     for(int site=myoff;site<last; site++){ | ||||
|       local = local + arg[site]; | ||||
|     } | ||||
|     partials[thr]=Reduce(local); | ||||
|   }); | ||||
|  | ||||
|   // serial accumulation over the per-thread partials, in thread order | ||||
|   sobj total=Zero(); | ||||
|   for(int t=0;t<nthread;t++){ | ||||
|     total = total+partials[t]; | ||||
|   } | ||||
|   return total; | ||||
| } | ||||
| // Threaded sum of arg[0..osites) returned as vobj::scalar_objectD. | ||||
| // Each thread adds up its share of the sites in a vobj accumulator; the Reduce()d | ||||
| // per-thread results and the final cross-thread sum are held in scalar_objectD. | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) | ||||
| { | ||||
|   typedef typename vobj::scalar_objectD  sobj; | ||||
|  | ||||
|   const int nthread = GridThread::GetThreads(); | ||||
|  | ||||
|   Vector<sobj> sumarray(nthread); | ||||
|   for(int i=0;i<nthread;i++){ | ||||
|     sumarray[i]=Zero(); | ||||
|   } | ||||
|    | ||||
|   thread_for(thr,nthread, { | ||||
|     // GetWork reports this thread's slice as a count (mywork) and a start offset (myoff) | ||||
|     int nwork, mywork, myoff; | ||||
|     nwork = osites; | ||||
|     GridThread::GetWork(nwork,thr,mywork,myoff); | ||||
|     vobj vvsum=Zero(); | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vvsum = vvsum + arg[ss]; | ||||
|     } | ||||
|     sumarray[thr]=Reduce(vvsum); | ||||
|   }); | ||||
|    | ||||
|   sobj ssum=Zero();  // sum across threads | ||||
|   for(int i=0;i<nthread;i++){ | ||||
|     ssum = ssum+sumarray[i]; | ||||
|   }  | ||||
|  | ||||
|   // Return the scalar_objectD total directly; copying it through a scalar_object | ||||
|   // first would discard the precision this function is meant to return. | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
|  | ||||
| // Sum of arg[0..osites): forwards to sum_gpu when GRID_CUDA or GRID_HIP is defined, otherwise to sum_cpu. | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) | ||||
| { | ||||
| #if !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|   return sum_cpu(arg,osites); | ||||
| #else | ||||
|   return sum_gpu(arg,osites); | ||||
| #endif   | ||||
| } | ||||
| // scalar_objectD sum of arg[0..osites): forwards to sumD_gpu when GRID_CUDA or GRID_HIP is defined, otherwise to sumD_cpu. | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) | ||||
| { | ||||
| #if !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|   return sumD_cpu(arg,osites); | ||||
| #else | ||||
|   return sumD_gpu(arg,osites); | ||||
| #endif   | ||||
| } | ||||
|  | ||||
| // Sum over a whole lattice: node-local sum of all outer sites (sum_gpu or sum_cpu, | ||||
| // depending on the build), followed by the grid's GlobalSum. | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
|   // accelerator build: read view in accelerator mode, summed by sum_gpu | ||||
|   autoView( in_v, arg, AcceleratorRead); | ||||
|   Integer nsites = arg.Grid()->oSites(); | ||||
|   auto total= sum_gpu(&in_v[0],nsites); | ||||
| #else | ||||
|   // host build: read view in CPU mode, summed by sum_cpu | ||||
|   autoView(in_v, arg, CpuRead); | ||||
|   Integer nsites = arg.Grid()->oSites(); | ||||
|   auto total= sum_cpu(&in_v[0],nsites); | ||||
| #endif   | ||||
|   arg.Grid()->GlobalSum(total); | ||||
|   return total; | ||||
| } | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Deterministic Reduction operations | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | ||||
|   auto nrm = innerProduct(arg,arg); | ||||
|   return std::real(nrm);  | ||||
|   ComplexD nrm = innerProduct(arg,arg); | ||||
|   return real(nrm);  | ||||
| } | ||||
|  | ||||
| // Double inner product | ||||
| template<class vobj> | ||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) | ||||
| inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) | ||||
| { | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_typeD vector_type; | ||||
|   GridBase *grid = left._grid; | ||||
|   const int pad = 8; | ||||
|  | ||||
|   ComplexD  inner; | ||||
|   Vector<ComplexD> sumarray(grid->SumArraySize()*pad); | ||||
|  | ||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|     int nwork, mywork, myoff; | ||||
|     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||
|      | ||||
|     decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]); | ||||
|     } | ||||
|     // All threads sum across SIMD; reduce serial work at end | ||||
|     // one write per cacheline with streaming store | ||||
|     ComplexD tmp = Reduce(TensorRemove(vinner)) ; | ||||
|     vstream(sumarray[thr*pad],tmp); | ||||
|   } | ||||
|   ComplexD  nrm; | ||||
|    | ||||
|   inner=0.0; | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     inner = inner+sumarray[i*pad]; | ||||
|   }  | ||||
|   right._grid->GlobalSum(inner); | ||||
|   return inner; | ||||
|   GridBase *grid = left.Grid(); | ||||
|  | ||||
|   const uint64_t nsimd = grid->Nsimd(); | ||||
|   const uint64_t sites = grid->oSites(); | ||||
|    | ||||
|   // Might make all code paths go this way. | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|      | ||||
|   { | ||||
|     autoView( left_v , left, AcceleratorRead); | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|  | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, 1,{ | ||||
| 	auto x_l = left_v[ss]; | ||||
| 	auto y_l = right_v[ss]; | ||||
| 	inner_tmp_v[ss]=innerProductD(x_l,y_l); | ||||
|     }); | ||||
|   } | ||||
|  | ||||
|   // This is in single precision and fails some tests | ||||
|   auto anrm = sum(inner_tmp_v,sites);   | ||||
|   nrm = anrm; | ||||
|   return nrm; | ||||
| } | ||||
|  | ||||
| // Inner product of two lattices: the rankInnerProduct result followed by GlobalSum on left's grid. | ||||
| template<class vobj> | ||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||
|   ComplexD result = rankInnerProduct(left,right); | ||||
|   GridBase *grid = left.Grid(); | ||||
|   grid->GlobalSum(result); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
|  | ||||
| ///////////////////////// | ||||
| // Fast axpby_norm | ||||
| // z = a x + b y | ||||
| @@ -86,8 +201,7 @@ axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj | ||||
| template<class sobj,class vobj> strong_inline RealD  | ||||
| axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)  | ||||
| { | ||||
|   const int pad = 8; | ||||
|   z.checkerboard = x.checkerboard; | ||||
|   z.Checkerboard() = x.Checkerboard(); | ||||
|   conformable(z,x); | ||||
|   conformable(x,y); | ||||
|  | ||||
| @@ -95,43 +209,79 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | ||||
|   typedef typename vobj::vector_typeD vector_type; | ||||
|   RealD  nrm; | ||||
|    | ||||
|   GridBase *grid = x._grid; | ||||
|   GridBase *grid = x.Grid(); | ||||
|  | ||||
|   const uint64_t nsimd = grid->Nsimd(); | ||||
|   const uint64_t sites = grid->oSites(); | ||||
|    | ||||
|   Vector<RealD> sumarray(grid->SumArraySize()*pad); | ||||
|    | ||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|     int nwork, mywork, myoff; | ||||
|     GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); | ||||
|      | ||||
|     // private to thread; sub summation | ||||
|     decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;  | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vobj tmp = a*x._odata[ss]+b*y._odata[ss]; | ||||
|       vnrm = vnrm + innerProductD(tmp,tmp); | ||||
|       vstream(z._odata[ss],tmp); | ||||
|     } | ||||
|     vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ; | ||||
|   } | ||||
|    | ||||
|   nrm = 0.0; // sum across threads; linear in thread count but fast | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     nrm = nrm+sumarray[i*pad]; | ||||
|   }  | ||||
|   z._grid->GlobalSum(nrm); | ||||
|   // GPU | ||||
|   autoView( x_v, x, AcceleratorRead); | ||||
|   autoView( y_v, y, AcceleratorRead); | ||||
|   autoView( z_v, z, AcceleratorWrite); | ||||
|  | ||||
|   typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|  | ||||
|   accelerator_for( ss, sites, 1,{ | ||||
|       auto tmp = a*x_v[ss]+b*y_v[ss]; | ||||
|       inner_tmp_v[ss]=innerProductD(tmp,tmp); | ||||
|       z_v[ss]=tmp; | ||||
|   }); | ||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||
|   grid->GlobalSum(nrm); | ||||
|   return nrm;  | ||||
| } | ||||
|  | ||||
|   | ||||
| // Computes ip = innerProduct(left,right) and nrm = real part of innerProduct(left,left) together: | ||||
| // per-site innerProductD values are stored in two temporaries, each is summed with sum(), | ||||
| // and both totals go through a single GlobalSumVector call. | ||||
| template<class vobj> strong_inline void | ||||
| innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right) | ||||
| { | ||||
|   conformable(left,right); | ||||
|  | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_typeD vector_type; | ||||
|   Vector<ComplexD> totals(2); | ||||
|  | ||||
|   GridBase *grid = left.Grid(); | ||||
|  | ||||
|   const uint64_t nsimd = grid->Nsimd(); | ||||
|   const uint64_t sites = grid->oSites(); | ||||
|  | ||||
|   // per-site results; both temporaries share the same element type | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) norm_t; | ||||
|   Vector<inner_t> site_inner(sites); | ||||
|   Vector<norm_t>  site_norm(sites); | ||||
|   auto site_inner_p = &site_inner[0]; | ||||
|   auto site_norm_p = &site_norm[0]; | ||||
|   { | ||||
|     // the views live only for the duration of this scope | ||||
|     autoView(lv,left, AcceleratorRead); | ||||
|     autoView(rv,right,AcceleratorRead); | ||||
|     accelerator_for( site, sites, 1,{ | ||||
|         auto l_site = lv[site]; | ||||
|         site_inner_p[site]=innerProductD(l_site,rv[site]); | ||||
|         site_norm_p [site]=innerProductD(l_site,l_site); | ||||
|       }); | ||||
|   } | ||||
|  | ||||
|   totals[0] = TensorRemove(sum(site_inner_p,sites)); | ||||
|   totals[1] = TensorRemove(sum(site_norm_p,sites)); | ||||
|  | ||||
|   // the norm is kept complex so both values can share one GlobalSumVector call | ||||
|   grid->GlobalSumVector(&totals[0],2); | ||||
|   ip = totals[0]; | ||||
|   nrm = real(totals[1]); | ||||
| } | ||||
|  | ||||
| template<class Op,class T1> | ||||
| inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) | ||||
|   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object | ||||
|   ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
| template<class Op,class T1,class T2> | ||||
| inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) | ||||
|       ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object | ||||
|       ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
| @@ -139,54 +289,14 @@ inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) | ||||
|  | ||||
| template<class Op,class T1,class T2,class T3> | ||||
| inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) | ||||
|   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), | ||||
| 				      eval(0,std::get<1>(expr.second)), | ||||
| 				      eval(0,std::get<2>(expr.second)) | ||||
|   ->typename decltype(expr.op.func(eval(0,expr.arg1), | ||||
| 				      eval(0,expr.arg2), | ||||
| 				      eval(0,expr.arg3) | ||||
| 				      ))::scalar_object | ||||
| { | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   GridBase *grid=arg._grid; | ||||
|   int Nsimd = grid->Nsimd(); | ||||
|    | ||||
|   std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize()); | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     sumarray[i]=zero; | ||||
|   } | ||||
|    | ||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|     int nwork, mywork, myoff; | ||||
|     GridThread::GetWork(grid->oSites(),thr,mywork,myoff); | ||||
|      | ||||
|     vobj vvsum=zero; | ||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
|       vvsum = vvsum + arg._odata[ss]; | ||||
|     } | ||||
|     sumarray[thr]=vvsum; | ||||
|   } | ||||
|    | ||||
|   vobj vsum=zero;  // sum across threads | ||||
|   for(int i=0;i<grid->SumArraySize();i++){ | ||||
|     vsum = vsum+sumarray[i]; | ||||
|   }  | ||||
|    | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   sobj ssum=zero; | ||||
|    | ||||
|   std::vector<sobj>               buf(Nsimd); | ||||
|   extract(vsum,buf); | ||||
|    | ||||
|   for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i]; | ||||
|   arg._grid->GlobalSum(ssum); | ||||
|    | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -199,7 +309,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   // But easily avoided by using double precision fields | ||||
|   /////////////////////////////////////////////////////// | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   GridBase  *grid = Data._grid; | ||||
|   GridBase  *grid = Data.Grid(); | ||||
|   assert(grid!=NULL); | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
| @@ -212,13 +322,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   int ld=grid->_ldimensions[orthogdim]; | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|   std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first | ||||
|   std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars | ||||
|   std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||
|   Vector<vobj> lvSum(rd); // will locally sum vectors first | ||||
|   Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars | ||||
|   ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node  | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|     lvSum[r]=Zero(); | ||||
|   } | ||||
|  | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
| @@ -227,20 +337,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|  | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
|   // Parallel over orthog direction | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|  | ||||
|   autoView( Data_v, Data, CpuRead); | ||||
|   thread_for( r,rd, { | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||
| 	lvSum[r]=lvSum[r]+Data_v[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   }); | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
|   std::vector<int> icoor(Nd); | ||||
|   Coordinate icoor(Nd); | ||||
|  | ||||
|   for(int rt=0;rt<rd;rt++){ | ||||
|  | ||||
| @@ -265,7 +374,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|     if ( pt == grid->_processor_coor[orthogdim] ) { | ||||
|       gsum=lsSum[lt]; | ||||
|     } else { | ||||
|       gsum=zero; | ||||
|       gsum=Zero(); | ||||
|     } | ||||
|  | ||||
|     grid->GlobalSum(gsum); | ||||
| @@ -274,123 +383,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) | ||||
| { | ||||
|   // std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl; | ||||
|  | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   std::vector<scalar_type> lsSum; | ||||
|   localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim); | ||||
|   globalSliceInnerProductVector(result, lhs, lsSum, orthogdim); | ||||
|   // std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl; | ||||
| } | ||||
|  | ||||
| template <class vobj> | ||||
| static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim) | ||||
| { | ||||
|   // std::cout << GridLogMessage << "Start prep" << std::endl; | ||||
|   typedef typename vobj::vector_type   vector_type; | ||||
|   typedef typename vobj::scalar_type   scalar_type; | ||||
|   GridBase  *grid = lhs._grid; | ||||
|   assert(grid!=NULL); | ||||
|   conformable(grid,rhs._grid); | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
|   assert(orthogdim >= 0); | ||||
|   assert(orthogdim < Nd); | ||||
|  | ||||
|   int fd=grid->_fdimensions[orthogdim]; | ||||
|   int ld=grid->_ldimensions[orthogdim]; | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|   // std::cout << GridLogMessage << "Start alloc" << std::endl; | ||||
|  | ||||
|   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first | ||||
|   lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||
|   std::vector<iScalar<scalar_type>> extracted(Nsimd);   // splitting the SIMD   | ||||
|   // std::cout << GridLogMessage << "End alloc" << std::endl; | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|   } | ||||
|  | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|   // std::cout << GridLogMessage << "End prep" << std::endl; | ||||
|   // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; | ||||
|   vector_type vv; | ||||
|   parallel_for(int r=0;r<rd;r++) | ||||
|   { | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|         int ss = so + n * stride + b; | ||||
|         vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss])); | ||||
|         lvSum[r] = lvSum[r] + vv; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   // std::cout << GridLogMessage << "End parallel inner product" << std::endl; | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
|   std::vector<int> icoor(Nd); | ||||
|   for(int rt=0;rt<rd;rt++){ | ||||
|  | ||||
|     iScalar<vector_type> temp;  | ||||
|     temp._internal = lvSum[rt]; | ||||
|     extract(temp,extracted); | ||||
|  | ||||
|     for(int idx=0;idx<Nsimd;idx++){ | ||||
|  | ||||
|       grid->iCoorFromIindex(icoor,idx); | ||||
|  | ||||
|       int ldx =rt+icoor[orthogdim]*rd; | ||||
|  | ||||
|       lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; | ||||
|  | ||||
|     } | ||||
|   } | ||||
|   // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl; | ||||
| } | ||||
| template <class vobj> | ||||
| static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim) | ||||
| { | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   GridBase *grid = lhs._grid; | ||||
|   int fd = result.size(); | ||||
|   int ld = lsSum.size(); | ||||
|   // sum over nodes. | ||||
|   std::vector<scalar_type> gsum; | ||||
|   gsum.resize(fd, scalar_type(0.0)); | ||||
|   // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl; | ||||
|   for(int t=0;t<fd;t++){ | ||||
|     int pt = t/ld; // processor plane | ||||
|     int lt = t%ld; | ||||
|     if ( pt == grid->_processor_coor[orthogdim] ) { | ||||
|       gsum[t]=lsSum[lt]; | ||||
|     } | ||||
|   } | ||||
|   // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl; | ||||
|   // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl; | ||||
|   grid->GlobalSumVector(&gsum[0], fd); | ||||
|   // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl; | ||||
|  | ||||
|   result = gsum; | ||||
| } | ||||
| template<class vobj> | ||||
| static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | ||||
| { | ||||
|   typedef typename vobj::vector_type   vector_type; | ||||
|   typedef typename vobj::scalar_type   scalar_type; | ||||
|   GridBase  *grid = lhs._grid; | ||||
|   GridBase  *grid = lhs.Grid(); | ||||
|   assert(grid!=NULL); | ||||
|   conformable(grid,rhs._grid); | ||||
|   conformable(grid,rhs.Grid()); | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
| @@ -402,34 +402,36 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | ||||
|   int ld=grid->_ldimensions[orthogdim]; | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first | ||||
|   std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||
|   std::vector<iScalar<scalar_type> > extracted(Nsimd);                  // splitting the SIMD | ||||
|   Vector<vector_type> lvSum(rd); // will locally sum vectors first | ||||
|   Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||
|   ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD   | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|     lvSum[r]=Zero(); | ||||
|   } | ||||
|  | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|   autoView( lhv, lhs, CpuRead); | ||||
|   autoView( rhv, rhs, CpuRead); | ||||
|   thread_for( r,rd,{ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); | ||||
| 	vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss])); | ||||
| 	lvSum[r]=lvSum[r]+vv; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   }); | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
|   std::vector<int> icoor(Nd); | ||||
|   Coordinate icoor(Nd); | ||||
|   for(int rt=0;rt<rd;rt++){ | ||||
|  | ||||
|     iScalar<vector_type> temp;  | ||||
| @@ -470,7 +472,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   int Nblock = rhs._grid->GlobalDimensions()[Orthog]; | ||||
|   int Nblock = rhs.Grid()->GlobalDimensions()[Orthog]; | ||||
|   std::vector<ComplexD> ip(Nblock); | ||||
|   sn.resize(Nblock); | ||||
|    | ||||
| @@ -492,7 +494,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice | ||||
|    | ||||
|   scalar_type zscale(scale); | ||||
|  | ||||
|   GridBase *grid  = X._grid; | ||||
|   GridBase *grid  = X.Grid(); | ||||
|  | ||||
|   int Nsimd  =grid->Nsimd(); | ||||
|   int Nblock =grid->GlobalDimensions()[orthogdim]; | ||||
| @@ -505,8 +507,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice | ||||
|   int e2     =grid->_slice_block [orthogdim]; | ||||
|   int stride =grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   std::vector<int> icoor; | ||||
|  | ||||
|   Coordinate icoor; | ||||
|   for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
| @@ -522,12 +523,13 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice | ||||
|  | ||||
|     tensor_reduced at; at=av; | ||||
|  | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|     autoView( Rv, R, CpuWrite); | ||||
|     autoView( Xv, X, CpuRead); | ||||
|     autoView( Yv, Y, CpuRead); | ||||
|     thread_for2d( n, e1, b,e2, { | ||||
| 	int ss= so+n*stride+b; | ||||
| 	R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; | ||||
|       } | ||||
|     } | ||||
| 	Rv[ss] = at*Xv[ss]+Yv[ss]; | ||||
|     }); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @@ -559,18 +561,18 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|   int Nblock = X.Grid()->GlobalDimensions()[Orthog]; | ||||
|  | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *FullGrid  = X.Grid(); | ||||
|   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|  | ||||
|   //  Lattice<vobj> Xslice(SliceGrid); | ||||
|   //  Lattice<vobj> Rslice(SliceGrid); | ||||
|  | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|   int nh =  FullGrid->_ndimension; | ||||
|   //  int nh =  FullGrid->_ndimension; | ||||
|   //  int nl = SliceGrid->_ndimension; | ||||
|   int nl = nh-1; | ||||
|   //  int nl = nh-1; | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
| @@ -578,28 +580,31 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice | ||||
|   int block =FullGrid->_slice_block [Orthog]; | ||||
|   int nblock=FullGrid->_slice_nblock[Orthog]; | ||||
|   int ostride=FullGrid->_ostride[Orthog]; | ||||
| #pragma omp parallel  | ||||
|   { | ||||
|     std::vector<vobj> s_x(Nblock); | ||||
|  | ||||
| #pragma omp for collapse(2) | ||||
|     for(int n=0;n<nblock;n++){ | ||||
|     for(int b=0;b<block;b++){ | ||||
|   autoView( X_v, X, CpuRead); | ||||
|   autoView( Y_v, Y, CpuRead); | ||||
|   autoView( R_v, R, CpuWrite); | ||||
|   thread_region | ||||
|   { | ||||
|     Vector<vobj> s_x(Nblock); | ||||
|  | ||||
|     thread_for_collapse_in_region(2, n,nblock, { | ||||
|      for(int b=0;b<block;b++){ | ||||
|       int o  = n*stride + b; | ||||
|  | ||||
|       for(int i=0;i<Nblock;i++){ | ||||
| 	s_x[i] = X[o+i*ostride]; | ||||
| 	s_x[i] = X_v[o+i*ostride]; | ||||
|       } | ||||
|  | ||||
|       vobj dot; | ||||
|       for(int i=0;i<Nblock;i++){ | ||||
| 	dot = Y[o+i*ostride]; | ||||
| 	dot = Y_v[o+i*ostride]; | ||||
| 	for(int j=0;j<Nblock;j++){ | ||||
| 	  dot = dot + s_x[j]*(scale*aa(j,i)); | ||||
| 	} | ||||
| 	R[o+i*ostride]=dot; | ||||
| 	R_v[o+i*ostride]=dot; | ||||
|       } | ||||
|     }} | ||||
|     }}); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| @@ -610,35 +615,38 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice< | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|   int Nblock = X.Grid()->GlobalDimensions()[Orthog]; | ||||
|  | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *FullGrid  = X.Grid(); | ||||
|   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|   //  Lattice<vobj> Xslice(SliceGrid); | ||||
|   //  Lattice<vobj> Rslice(SliceGrid); | ||||
|  | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|   int nh =  FullGrid->_ndimension; | ||||
|   //  int nh =  FullGrid->_ndimension; | ||||
|   //  int nl = SliceGrid->_ndimension; | ||||
|   int nl=1; | ||||
|   //  int nl=1; | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   // thread_for2d_in_region | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
|   int stride=FullGrid->_slice_stride[Orthog]; | ||||
|   int block =FullGrid->_slice_block [Orthog]; | ||||
|   int nblock=FullGrid->_slice_nblock[Orthog]; | ||||
|   int ostride=FullGrid->_ostride[Orthog]; | ||||
| #pragma omp parallel  | ||||
|   autoView( R_v, R, CpuWrite); | ||||
|   autoView( X_v, X, CpuRead); | ||||
|   thread_region | ||||
|   { | ||||
|     std::vector<vobj> s_x(Nblock); | ||||
|  | ||||
| #pragma omp for collapse(2) | ||||
|     for(int n=0;n<nblock;n++){ | ||||
|  | ||||
|     thread_for_collapse_in_region( 2 ,n,nblock,{ | ||||
|     for(int b=0;b<block;b++){ | ||||
|       int o  = n*stride + b; | ||||
|  | ||||
|       for(int i=0;i<Nblock;i++){ | ||||
| 	s_x[i] = X[o+i*ostride]; | ||||
| 	s_x[i] = X_v[o+i*ostride]; | ||||
|       } | ||||
|  | ||||
|       vobj dot; | ||||
| @@ -647,11 +655,10 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice< | ||||
| 	for(int j=1;j<Nblock;j++){ | ||||
| 	  dot = dot + s_x[j]*(scale*aa(j,i)); | ||||
| 	} | ||||
| 	R[o+i*ostride]=dot; | ||||
| 	R_v[o+i*ostride]=dot; | ||||
|       } | ||||
|     }} | ||||
|     }}); | ||||
|   } | ||||
|  | ||||
| }; | ||||
|  | ||||
|  | ||||
| @@ -662,7 +669,7 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   GridBase *FullGrid  = lhs._grid; | ||||
|   GridBase *FullGrid  = lhs.Grid(); | ||||
|   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||
| @@ -673,9 +680,9 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | ||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|  | ||||
|   assert( FullGrid->_simd_layout[Orthog]==1); | ||||
|   int nh =  FullGrid->_ndimension; | ||||
|   //  int nh =  FullGrid->_ndimension; | ||||
|   //  int nl = SliceGrid->_ndimension; | ||||
|   int nl = nh-1; | ||||
|   //  int nl = nh-1; | ||||
|  | ||||
|   //FIXME package in a convenient iterator | ||||
|   //Should loop over a plane orthogonal to direction "Orthog" | ||||
| @@ -686,31 +693,33 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | ||||
|  | ||||
|   typedef typename vobj::vector_typeD vector_typeD; | ||||
|  | ||||
| #pragma omp parallel  | ||||
|   autoView( lhs_v, lhs, CpuRead); | ||||
|   autoView( rhs_v, rhs, CpuRead); | ||||
|   thread_region | ||||
|   { | ||||
|     std::vector<vobj> Left(Nblock); | ||||
|     std::vector<vobj> Right(Nblock); | ||||
|     Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|  | ||||
| #pragma omp for collapse(2) | ||||
|     for(int n=0;n<nblock;n++){ | ||||
|     thread_for_collapse_in_region( 2, n,nblock,{ | ||||
|     for(int b=0;b<block;b++){ | ||||
|  | ||||
|       int o  = n*stride + b; | ||||
|  | ||||
|       for(int i=0;i<Nblock;i++){ | ||||
| 	Left [i] = lhs[o+i*ostride]; | ||||
| 	Right[i] = rhs[o+i*ostride]; | ||||
| 	Left [i] = lhs_v[o+i*ostride]; | ||||
| 	Right[i] = rhs_v[o+i*ostride]; | ||||
|       } | ||||
|  | ||||
|       for(int i=0;i<Nblock;i++){ | ||||
|       for(int j=0;j<Nblock;j++){ | ||||
| 	auto tmp = innerProduct(Left[i],Right[j]); | ||||
| 	auto rtmp = TensorRemove(tmp); | ||||
| 	mat_thread(i,j) += Reduce(rtmp); | ||||
| 	auto red  =  Reduce(rtmp); | ||||
| 	mat_thread(i,j) += std::complex<double>(real(red),imag(red)); | ||||
|       }} | ||||
|     }} | ||||
| #pragma omp critical | ||||
|     }}); | ||||
|     thread_critical | ||||
|     { | ||||
|       mat += mat_thread; | ||||
|     }   | ||||
| @@ -726,8 +735,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | ||||
|   return; | ||||
| } | ||||
|  | ||||
| } /*END NAMESPACE GRID*/ | ||||
| #endif | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										231
									
								
								Grid/lattice/Lattice_reduction_gpu.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										231
									
								
								Grid/lattice/Lattice_reduction_gpu.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,231 @@ | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| #ifdef GRID_HIP | ||||
| extern hipDeviceProp_t *gpu_props; | ||||
| #endif | ||||
| #ifdef GRID_CUDA | ||||
| extern cudaDeviceProp *gpu_props; | ||||
| #endif | ||||
|  | ||||
| #define WARP_SIZE 32 | ||||
| __device__ unsigned int retirementCount = 0; | ||||
|  | ||||
/// Round @p x up to the nearest power of two.
///
/// Inputs of 0 or 1 (and negative values of a signed Iterator) yield 1, so the
/// result is always a usable power of two; the unguarded bit trick wraps
/// around and returns 0 for x == 0.
///
/// @tparam Iterator integral type of the argument.
/// @param  x        value to round up.
/// @return the smallest power of two that is >= x, converted to unsigned int.
///         The result is not meaningful when that power of two does not fit
///         in Iterator or in unsigned int.
template <class Iterator>
unsigned int nextPow2(Iterator x) {
  if (x <= 1) return 1;
  --x;
  // Smear the highest set bit into every lower position. Doubling the shift
  // up to the bit width of Iterator also covers types wider than 32 bits.
  for (unsigned int shift = 1; shift < 8*sizeof(Iterator); shift <<= 1) {
    x |= x >> shift;
  }
  return ++x;
}
|  | ||||
| template <class Iterator> | ||||
| void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { | ||||
|    | ||||
|   int device; | ||||
| #ifdef GRID_CUDA | ||||
|   cudaGetDevice(&device); | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|   hipGetDevice(&device); | ||||
| #endif | ||||
|    | ||||
|   Iterator warpSize            = gpu_props[device].warpSize; | ||||
|   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock; | ||||
|   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock; | ||||
|   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount; | ||||
|    | ||||
|   std::cout << GridLogDebug << "GPU has:" << std::endl; | ||||
|   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl; | ||||
|   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl; | ||||
|   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl; | ||||
|   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl; | ||||
|   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; | ||||
|    | ||||
|   if (warpSize != WARP_SIZE) { | ||||
|     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl; | ||||
|     exit(EXIT_FAILURE); | ||||
|   } | ||||
|    | ||||
|   // let the number of threads in a block be a multiple of 2, starting from warpSize | ||||
|   threads = warpSize; | ||||
|   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; | ||||
|   // keep all the streaming multiprocessors busy | ||||
|   blocks = nextPow2(multiProcessorCount); | ||||
|    | ||||
| } | ||||
|  | ||||
| template <class sobj, class Iterator> | ||||
| __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) { | ||||
|    | ||||
|   Iterator blockSize = blockDim.x; | ||||
|    | ||||
|   // cannot use overloaded operators for sobj as they are not volatile-qualified | ||||
|   memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); | ||||
|   __syncwarp(); | ||||
|    | ||||
|   const Iterator VEC = WARP_SIZE; | ||||
|   const Iterator vid = tid & (VEC-1); | ||||
|    | ||||
|   sobj beta, temp; | ||||
|   memcpy((void *)&beta, (void *)&mySum, sizeof(sobj)); | ||||
|    | ||||
|   for (int i = VEC/2; i > 0; i>>=1) { | ||||
|     if (vid < i) { | ||||
|       memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj)); | ||||
|       beta += temp; | ||||
|       memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); | ||||
|     } | ||||
|     __syncwarp(); | ||||
|   } | ||||
|   __syncthreads(); | ||||
|    | ||||
|   if (threadIdx.x == 0) { | ||||
|     beta  = Zero(); | ||||
|     for (Iterator i = 0; i < blockSize; i += VEC) { | ||||
|       memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj)); | ||||
|       beta  += temp; | ||||
|     } | ||||
|     memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); | ||||
|   } | ||||
|   __syncthreads(); | ||||
| } | ||||
|  | ||||
|  | ||||
| template <class vobj, class sobj, class Iterator> | ||||
| __device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)  | ||||
| { | ||||
|   constexpr Iterator nsimd = vobj::Nsimd(); | ||||
|    | ||||
|   Iterator blockSize = blockDim.x; | ||||
|    | ||||
|   // force shared memory alignment | ||||
|   extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[]; | ||||
|   // it's not possible to have two extern __shared__ arrays with same name | ||||
|   // but different types in different scopes -- need to cast each time | ||||
|   sobj *sdata = (sobj *)shmem_pointer; | ||||
|    | ||||
|   // first level of reduction, | ||||
|   // each thread writes result in mySum | ||||
|   Iterator tid = threadIdx.x; | ||||
|   Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x; | ||||
|   Iterator gridSize = blockSize*2*gridDim.x; | ||||
|   sobj mySum = Zero(); | ||||
|    | ||||
|   while (i < n) { | ||||
|     Iterator lane = i % nsimd; | ||||
|     Iterator ss   = i / nsimd; | ||||
|     auto tmp = extractLane(lane,g_idata[ss]); | ||||
|     sobj tmpD; | ||||
|     tmpD=tmp; | ||||
|     mySum   +=tmpD; | ||||
|      | ||||
|     if (i + blockSize < n) { | ||||
|       lane = (i+blockSize) % nsimd; | ||||
|       ss   = (i+blockSize) / nsimd; | ||||
|       tmp = extractLane(lane,g_idata[ss]); | ||||
|       tmpD = tmp; | ||||
|       mySum += tmpD; | ||||
|     } | ||||
|     i += gridSize; | ||||
|   } | ||||
|    | ||||
|   // copy mySum to shared memory and perform | ||||
|   // reduction for all threads in this block | ||||
|   reduceBlock(sdata, mySum, tid); | ||||
|   if (tid == 0) g_odata[blockIdx.x] = sdata[0]; | ||||
| } | ||||
|  | ||||
| template <class vobj, class sobj,class Iterator> | ||||
| __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { | ||||
|    | ||||
|   Iterator blockSize = blockDim.x; | ||||
|    | ||||
|   // perform reduction for this block and | ||||
|   // write result to global memory buffer | ||||
|   reduceBlocks(lat, buffer, n); | ||||
|    | ||||
|   if (gridDim.x > 1) { | ||||
|      | ||||
|     const Iterator tid = threadIdx.x; | ||||
|     __shared__ bool amLast; | ||||
|     // force shared memory alignment | ||||
|     extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[]; | ||||
|     // it's not possible to have two extern __shared__ arrays with same name | ||||
|     // but different types in different scopes -- need to cast each time | ||||
|     sobj *smem = (sobj *)shmem_pointer; | ||||
|      | ||||
|     // wait until all outstanding memory instructions in this thread are finished | ||||
|     acceleratorFence(); | ||||
|      | ||||
|     if (tid==0) { | ||||
|       unsigned int ticket = atomicInc(&retirementCount, gridDim.x); | ||||
|       // true if this block is the last block to be done | ||||
|       amLast = (ticket == gridDim.x-1); | ||||
|     } | ||||
|      | ||||
|     // each thread must read the correct value of amLast | ||||
|     acceleratorSynchroniseAll(); | ||||
|  | ||||
|     if (amLast) { | ||||
|       // reduce buffer[0], ..., buffer[gridDim.x-1] | ||||
|       Iterator i = tid; | ||||
|       sobj mySum = Zero(); | ||||
|        | ||||
|       while (i < gridDim.x) { | ||||
|         mySum += buffer[i]; | ||||
|         i += blockSize; | ||||
|       } | ||||
|        | ||||
|       reduceBlock(smem, mySum, tid); | ||||
|        | ||||
|       if (tid==0) { | ||||
|         buffer[0] = smem[0]; | ||||
|         // reset count variable | ||||
|         retirementCount = 0; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Possibly promote to double and sum | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)  | ||||
| { | ||||
|   typedef typename vobj::scalar_objectD sobj; | ||||
|   typedef decltype(lat) Iterator; | ||||
|    | ||||
|   Integer nsimd= vobj::Nsimd(); | ||||
|   Integer size = osites*nsimd; | ||||
|  | ||||
|   Integer numThreads, numBlocks; | ||||
|   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); | ||||
|   Integer smemSize = numThreads * sizeof(sobj); | ||||
|  | ||||
|   Vector<sobj> buffer(numBlocks); | ||||
|   sobj *buffer_v = &buffer[0]; | ||||
|    | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); | ||||
|   accelerator_barrier(); | ||||
|   auto result = buffer_v[0]; | ||||
|   return result; | ||||
| } | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Return as same precision as input performing reduction in double precision though | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)  | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   sobj result; | ||||
|   result = sumD_gpu(lat,osites); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -24,8 +24,8 @@ | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_RNG_H | ||||
| #define GRID_LATTICE_RNG_H | ||||
|  | ||||
| @@ -41,282 +41,289 @@ | ||||
| #undef  RNG_FAST_DISCARD | ||||
| #endif | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////// | ||||
|   // Allow the RNG state to be less dense than the fine grid | ||||
|   ////////////////////////////////////////////////////////////// | ||||
|   inline int RNGfillable(GridBase *coarse,GridBase *fine) | ||||
|   { | ||||
| ////////////////////////////////////////////////////////////// | ||||
| // Allow the RNG state to be less dense than the fine grid | ||||
| ////////////////////////////////////////////////////////////// | ||||
| inline int RNGfillable(GridBase *coarse,GridBase *fine) | ||||
| { | ||||
|  | ||||
|     int rngdims = coarse->_ndimension; | ||||
|   int rngdims = coarse->_ndimension; | ||||
|  | ||||
|     // trivially extended in higher dims, with locality guaranteeing RNG state is local to node | ||||
|     int lowerdims   = fine->_ndimension - coarse->_ndimension; | ||||
|     assert(lowerdims >= 0); | ||||
|     for(int d=0;d<lowerdims;d++){ | ||||
|       assert(fine->_simd_layout[d]==1); | ||||
|       assert(fine->_processors[d]==1); | ||||
|     } | ||||
|  | ||||
|     int multiplicity=1; | ||||
|     for(int d=0;d<lowerdims;d++){ | ||||
|       multiplicity=multiplicity*fine->_rdimensions[d]; | ||||
|     } | ||||
|     // local and global volumes subdivide cleanly after SIMDization | ||||
|     for(int d=0;d<rngdims;d++){ | ||||
|       int fd= d+lowerdims; | ||||
|       assert(coarse->_processors[d]  == fine->_processors[fd]); | ||||
|       assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]); | ||||
|       assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);  | ||||
|  | ||||
|       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];  | ||||
|     } | ||||
|     return multiplicity; | ||||
|   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node | ||||
|   int lowerdims   = fine->_ndimension - coarse->_ndimension; | ||||
|   assert(lowerdims >= 0); | ||||
|   for(int d=0;d<lowerdims;d++){ | ||||
|     assert(fine->_simd_layout[d]==1); | ||||
|     assert(fine->_processors[d]==1); | ||||
|   } | ||||
|  | ||||
|   int multiplicity=1; | ||||
|   for(int d=0;d<lowerdims;d++){ | ||||
|     multiplicity=multiplicity*fine->_rdimensions[d]; | ||||
|   } | ||||
|   // local and global volumes subdivide cleanly after SIMDization | ||||
|   for(int d=0;d<rngdims;d++){ | ||||
|     int fd= d+lowerdims; | ||||
|     assert(coarse->_processors[d]  == fine->_processors[fd]); | ||||
|     assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]); | ||||
|     assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);  | ||||
|  | ||||
|     multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];  | ||||
|   } | ||||
|   return multiplicity; | ||||
| } | ||||
|  | ||||
|    | ||||
| // merge of April 11 2017 | ||||
|   // this function is necessary for the LS vectorised field | ||||
|   inline int RNGfillable_general(GridBase *coarse,GridBase *fine) | ||||
|   { | ||||
|     int rngdims = coarse->_ndimension; | ||||
| // this function is necessary for the LS vectorised field | ||||
| inline int RNGfillable_general(GridBase *coarse,GridBase *fine) | ||||
| { | ||||
|   int rngdims = coarse->_ndimension; | ||||
|      | ||||
|     // trivially extended in higher dims, with locality guaranteeing RNG state is local to node | ||||
|     int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0); | ||||
|     // assumes that the higher dimensions are not using more processors | ||||
|     // all further divisions are local | ||||
|     for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1); | ||||
|     for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]); | ||||
|   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node | ||||
|   int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0); | ||||
|   // assumes that the higher dimensions are not using more processors | ||||
|   // all further divisions are local | ||||
|   for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1); | ||||
|   for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]); | ||||
|  | ||||
|     // then divide the number of local sites | ||||
|     // check that the total Nsimd agrees, which means the iSites are the same | ||||
|     assert(fine->Nsimd() == coarse->Nsimd()); | ||||
|   // then divide the number of local sites | ||||
|   // check that the total Nsimd agrees, which means the iSites are the same | ||||
|   assert(fine->Nsimd() == coarse->Nsimd()); | ||||
|  | ||||
|     // check that the two grids divide cleanly | ||||
|     assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); | ||||
|   // check that the two grids divide cleanly | ||||
|   assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); | ||||
|  | ||||
|     return fine->lSites() / coarse->lSites(); | ||||
|   } | ||||
|   return fine->lSites() / coarse->lSites(); | ||||
| } | ||||
|    | ||||
|   // real scalars are one component | ||||
|   template<class scalar,class distribution,class generator>  | ||||
|   void fillScalar(scalar &s,distribution &dist,generator & gen) | ||||
|   { | ||||
|     s=dist(gen); | ||||
|   } | ||||
|   template<class distribution,class generator>  | ||||
|   void fillScalar(ComplexF &s,distribution &dist, generator &gen) | ||||
|   { | ||||
|     s=ComplexF(dist(gen),dist(gen)); | ||||
|   } | ||||
|   template<class distribution,class generator>  | ||||
|   void fillScalar(ComplexD &s,distribution &dist,generator &gen) | ||||
|   { | ||||
|     s=ComplexD(dist(gen),dist(gen)); | ||||
|   } | ||||
| // real scalars are one component | ||||
| // Generic overload: draws a single value from dist using gen and assigns it to s. | ||||
| template<class scalar,class distribution,class generator>  | ||||
| void fillScalar(scalar &s,distribution &dist,generator & gen) | ||||
| { | ||||
|   s=dist(gen); | ||||
| } | ||||
| // ComplexF overload: two draws from dist per scalar, the first stored as the real part | ||||
| // and the second as the imaginary part. | ||||
| template<class distribution,class generator>  | ||||
| void fillScalar(ComplexF &s,distribution &dist, generator &gen) | ||||
| { | ||||
|   // Previous constructor form, kept for reference; as function arguments the two | ||||
|   // dist(gen) calls had no specified evaluation order, whereas the statements below fix it. | ||||
|   //  s=ComplexF(dist(gen),dist(gen)); | ||||
|   s.real(dist(gen)); | ||||
|   s.imag(dist(gen)); | ||||
| } | ||||
| // ComplexD overload: two draws from dist per scalar, the first stored as the real part | ||||
| // and the second as the imaginary part. | ||||
| template<class distribution,class generator>  | ||||
| void fillScalar(ComplexD &s,distribution &dist,generator &gen) | ||||
| { | ||||
|   // Previous constructor form, kept for reference; as function arguments the two | ||||
|   // dist(gen) calls had no specified evaluation order, whereas the statements below fix it. | ||||
|   //  s=ComplexD(dist(gen),dist(gen)); | ||||
|   s.real(dist(gen)); | ||||
|   s.imag(dist(gen)); | ||||
| } | ||||
|    | ||||
|   class GridRNGbase { | ||||
|   public: | ||||
|     // One generator per site. | ||||
|     // Uniform and Gaussian distributions from these generators. | ||||
| class GridRNGbase { | ||||
| public: | ||||
|   // One generator per site. | ||||
|   // Uniform and Gaussian distributions from these generators. | ||||
| #ifdef RNG_RANLUX | ||||
|     typedef std::ranlux48 RngEngine; | ||||
|     typedef uint64_t      RngStateType; | ||||
|     static const int RngStateCount = 15; | ||||
|   typedef std::ranlux48 RngEngine; | ||||
|   typedef uint64_t      RngStateType; | ||||
|   static const int RngStateCount = 15; | ||||
| #endif  | ||||
| #ifdef RNG_MT19937  | ||||
|     typedef std::mt19937 RngEngine; | ||||
|     typedef uint32_t     RngStateType; | ||||
|     static const int     RngStateCount = std::mt19937::state_size; | ||||
|   typedef std::mt19937 RngEngine; | ||||
|   typedef uint32_t     RngStateType; | ||||
|   static const int     RngStateCount = std::mt19937::state_size; | ||||
| #endif | ||||
| #ifdef RNG_SITMO | ||||
|     typedef sitmo::prng_engine 	RngEngine; | ||||
|     typedef uint64_t    	RngStateType; | ||||
|     static const int    	RngStateCount = 13; | ||||
|   typedef sitmo::prng_engine 	RngEngine; | ||||
|   typedef uint64_t    	RngStateType; | ||||
|   static const int    	RngStateCount = 13; | ||||
| #endif | ||||
|  | ||||
|     std::vector<RngEngine>                             _generators; | ||||
|     std::vector<std::uniform_real_distribution<RealD> > _uniform; | ||||
|     std::vector<std::normal_distribution<RealD> >       _gaussian; | ||||
|     std::vector<std::discrete_distribution<int32_t> >   _bernoulli; | ||||
|     std::vector<std::uniform_int_distribution<uint32_t> > _uid; | ||||
|   std::vector<RngEngine>                             _generators; | ||||
|   std::vector<std::uniform_real_distribution<RealD> > _uniform; | ||||
|   std::vector<std::normal_distribution<RealD> >       _gaussian; | ||||
|   std::vector<std::discrete_distribution<int32_t> >   _bernoulli; | ||||
|   std::vector<std::uniform_int_distribution<uint32_t> > _uid; | ||||
|  | ||||
|     /////////////////////// | ||||
|     // support for parallel init | ||||
|     /////////////////////// | ||||
|   /////////////////////// | ||||
|   // support for parallel init | ||||
|   /////////////////////// | ||||
| #ifdef RNG_FAST_DISCARD | ||||
|     static void Skip(RngEngine &eng,uint64_t site) | ||||
|     { | ||||
|       ///////////////////////////////////////////////////////////////////////////////////// | ||||
|       // Skip by 2^40 elements between successive lattice sites | ||||
|       // This goes by 10^12. | ||||
|       // Consider quenched updating; likely never exceeding rate of 1000 sweeps | ||||
|       // per second on any machine. This gives us of order 10^9 seconds, or 100 years | ||||
|       // skip ahead. | ||||
|       // For HMC unlikely to go at faster than a solve per second, and  | ||||
|       // tens of seconds per trajectory so this is clean in all reasonable cases, | ||||
|       // and margin of safety is orders of magnitude. | ||||
|       // We could hack Sitmo to skip in the higher order words of state if necessary | ||||
|   static void Skip(RngEngine &eng,uint64_t site) | ||||
|   { | ||||
|     ///////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Skip by 2^40 elements between successive lattice sites | ||||
|     // This goes by 10^12. | ||||
|     // Consider quenched updating; likely never exceeding rate of 1000 sweeps | ||||
|     // per second on any machine. This gives us of order 10^9 seconds, or 100 years | ||||
|     // skip ahead. | ||||
|     // For HMC unlikely to go at faster than a solve per second, and  | ||||
|     // tens of seconds per trajectory so this is clean in all reasonable cases, | ||||
|     // and margin of safety is orders of magnitude. | ||||
|     // We could hack Sitmo to skip in the higher order words of state if necessary | ||||
|       // | ||||
|       // Replace with 2^30 ; avoid problem on large volumes | ||||
|       // | ||||
|       ///////////////////////////////////////////////////////////////////////////////////// | ||||
|       //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init | ||||
|       const int shift = 30; | ||||
|     ///////////////////////////////////////////////////////////////////////////////////// | ||||
|     //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init | ||||
|     const int shift = 30; | ||||
|  | ||||
|       uint64_t skip = site; | ||||
|     //////////////////////////////////////////////////////////////////// | ||||
|     // Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift. | ||||
|     //////////////////////////////////////////////////////////////////// | ||||
|     volatile uint64_t skip = site; | ||||
|  | ||||
|       skip = skip<<shift; | ||||
|     skip = skip<<shift; | ||||
|  | ||||
|       assert((skip >> shift)==site); // check for overflow | ||||
|     assert((skip >> shift)==site); // check for overflow | ||||
|  | ||||
|       eng.discard(skip); | ||||
|       //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; | ||||
|     }  | ||||
|     eng.discard(skip); | ||||
|     //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; | ||||
|   }  | ||||
| #endif | ||||
|     static RngEngine Reseed(RngEngine &eng) | ||||
|     { | ||||
|       std::vector<uint32_t> newseed; | ||||
|       std::uniform_int_distribution<uint32_t> uid; | ||||
|       return Reseed(eng,newseed,uid); | ||||
|     } | ||||
|     static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, | ||||
| 			    std::uniform_int_distribution<uint32_t> &uid) | ||||
|     { | ||||
|       const int reseeds=4; | ||||
|   static RngEngine Reseed(RngEngine &eng) | ||||
|   { | ||||
|     std::vector<uint32_t> newseed; | ||||
|     std::uniform_int_distribution<uint32_t> uid; | ||||
|     return Reseed(eng,newseed,uid); | ||||
|   } | ||||
|   static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, | ||||
| 			  std::uniform_int_distribution<uint32_t> &uid) | ||||
|   { | ||||
|     const int reseeds=4; | ||||
|        | ||||
|       newseed.resize(reseeds); | ||||
|       for(int i=0;i<reseeds;i++){ | ||||
| 	newseed[i] = uid(eng); | ||||
|       } | ||||
|       std::seed_seq sseq(newseed.begin(),newseed.end()); | ||||
|       return RngEngine(sseq); | ||||
|     }     | ||||
|     newseed.resize(reseeds); | ||||
|     for(int i=0;i<reseeds;i++){ | ||||
|       newseed[i] = uid(eng); | ||||
|     } | ||||
|     std::seed_seq sseq(newseed.begin(),newseed.end()); | ||||
|     return RngEngine(sseq); | ||||
|   }     | ||||
|  | ||||
|     void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { | ||||
|       saved.resize(RngStateCount); | ||||
|       std::stringstream ss; | ||||
|       ss<<eng; | ||||
|       ss.seekg(0,ss.beg); | ||||
|       for(int i=0;i<RngStateCount;i++){ | ||||
|         ss>>saved[i]; | ||||
|       } | ||||
|   void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { | ||||
|     saved.resize(RngStateCount); | ||||
|     std::stringstream ss; | ||||
|     ss<<eng; | ||||
|     ss.seekg(0,ss.beg); | ||||
|     for(int i=0;i<RngStateCount;i++){ | ||||
|       ss>>saved[i]; | ||||
|     } | ||||
|     void GetState(std::vector<RngStateType> & saved,int gen) { | ||||
|       GetState(saved,_generators[gen]); | ||||
|   } | ||||
|   void GetState(std::vector<RngStateType> & saved,int gen) { | ||||
|     GetState(saved,_generators[gen]); | ||||
|   } | ||||
|   void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ | ||||
|     assert(saved.size()==RngStateCount); | ||||
|     std::stringstream ss; | ||||
|     for(int i=0;i<RngStateCount;i++){ | ||||
|       ss<< saved[i]<<" "; | ||||
|     } | ||||
|     void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ | ||||
|       assert(saved.size()==RngStateCount); | ||||
|       std::stringstream ss; | ||||
|       for(int i=0;i<RngStateCount;i++){ | ||||
|         ss<< saved[i]<<" "; | ||||
|       } | ||||
|       ss.seekg(0,ss.beg); | ||||
|       ss>>eng; | ||||
|     } | ||||
|     void SetState(std::vector<RngStateType> & saved,int gen){ | ||||
|       SetState(saved,_generators[gen]); | ||||
|     } | ||||
|     void SetEngine(RngEngine &Eng, int gen){ | ||||
|       _generators[gen]=Eng; | ||||
|     } | ||||
|     void GetEngine(RngEngine &Eng, int gen){ | ||||
|       Eng=_generators[gen]; | ||||
|     } | ||||
|     template<class source> void Seed(source &src, int gen) | ||||
|     { | ||||
|       _generators[gen] = RngEngine(src); | ||||
|     }     | ||||
|   }; | ||||
|     ss.seekg(0,ss.beg); | ||||
|     ss>>eng; | ||||
|   } | ||||
|   void SetState(std::vector<RngStateType> & saved,int gen){ | ||||
|     SetState(saved,_generators[gen]); | ||||
|   } | ||||
|   void SetEngine(RngEngine &Eng, int gen){ | ||||
|     _generators[gen]=Eng; | ||||
|   } | ||||
|   void GetEngine(RngEngine &Eng, int gen){ | ||||
|     Eng=_generators[gen]; | ||||
|   } | ||||
|   template<class source> void Seed(source &src, int gen) | ||||
|   { | ||||
|     _generators[gen] = RngEngine(src); | ||||
|   }     | ||||
| }; | ||||
|  | ||||
|   class GridSerialRNG : public GridRNGbase { | ||||
|   public: | ||||
| class GridSerialRNG : public GridRNGbase { | ||||
| public: | ||||
|  | ||||
|     GridSerialRNG() : GridRNGbase() { | ||||
|       _generators.resize(1); | ||||
|       _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); | ||||
|       _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|       _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); | ||||
|       _uid.resize(1,std::uniform_int_distribution<uint32_t>() ); | ||||
|     } | ||||
|   GridSerialRNG() : GridRNGbase() { | ||||
|     _generators.resize(1); | ||||
|     _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); | ||||
|     _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|     _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); | ||||
|     _uid.resize(1,std::uniform_int_distribution<uint32_t>() ); | ||||
|   } | ||||
|  | ||||
|     template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ | ||||
|   template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ | ||||
|  | ||||
|       typedef typename sobj::scalar_type scalar_type; | ||||
|     typedef typename sobj::scalar_type scalar_type; | ||||
|   | ||||
|       int words = sizeof(sobj)/sizeof(scalar_type); | ||||
|     int words = sizeof(sobj)/sizeof(scalar_type); | ||||
|  | ||||
|       scalar_type *buf = (scalar_type *) & l; | ||||
|     scalar_type *buf = (scalar_type *) & l; | ||||
|  | ||||
|       dist[0].reset(); | ||||
|       for(int idx=0;idx<words;idx++){ | ||||
| 	fillScalar(buf[idx],dist[0],_generators[0]); | ||||
|       } | ||||
|     dist[0].reset(); | ||||
|     for(int idx=0;idx<words;idx++){ | ||||
|       fillScalar(buf[idx],dist[0],_generators[0]); | ||||
|     } | ||||
|  | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|  | ||||
|     }; | ||||
|   } | ||||
|  | ||||
|     template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){ | ||||
|       dist[0].reset(); | ||||
|       fillScalar(l,dist[0],_generators[0]); | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){ | ||||
|     dist[0].reset(); | ||||
|     fillScalar(l,dist[0],_generators[0]); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){ | ||||
|     dist[0].reset(); | ||||
|     fillScalar(l,dist[0],_generators[0]); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){ | ||||
|     dist[0].reset(); | ||||
|     fillScalar(l,dist[0],_generators[0]); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){ | ||||
|     dist[0].reset(); | ||||
|     fillScalar(l,dist[0],_generators[0]); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   // vector fill | ||||
|   template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){ | ||||
|     RealF *pointer=(RealF *)&l; | ||||
|     dist[0].reset(); | ||||
|     for(int i=0;i<2*vComplexF::Nsimd();i++){ | ||||
|       fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){ | ||||
|       dist[0].reset(); | ||||
|       fillScalar(l,dist[0],_generators[0]); | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){ | ||||
|     RealD *pointer=(RealD *)&l; | ||||
|     dist[0].reset(); | ||||
|     for(int i=0;i<2*vComplexD::Nsimd();i++){ | ||||
|       fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){ | ||||
|       dist[0].reset(); | ||||
|       fillScalar(l,dist[0],_generators[0]); | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){ | ||||
|     RealF *pointer=(RealF *)&l; | ||||
|     dist[0].reset(); | ||||
|     for(int i=0;i<vRealF::Nsimd();i++){ | ||||
|       fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){ | ||||
|       dist[0].reset(); | ||||
|       fillScalar(l,dist[0],_generators[0]); | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
|     // vector fill | ||||
|     template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){ | ||||
|       RealF *pointer=(RealF *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<2*vComplexF::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){ | ||||
|       RealD *pointer=(RealD *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<2*vComplexD::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){ | ||||
|       RealF *pointer=(RealF *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<vRealF::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     } | ||||
|     template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){ | ||||
|       RealD *pointer=(RealD *)&l; | ||||
|       dist[0].reset(); | ||||
|       for(int i=0;i<vRealD::Nsimd();i++){ | ||||
| 	fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|       } | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|   template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){ | ||||
|     RealD *pointer=(RealD *)&l; | ||||
|     dist[0].reset(); | ||||
|     for(int i=0;i<vRealD::Nsimd();i++){ | ||||
|       fillScalar(pointer[i],dist[0],_generators[0]); | ||||
|     } | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||
|   } | ||||
|      | ||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|       std::seed_seq src(seeds.begin(),seeds.end()); | ||||
|       Seed(src,0); | ||||
|     } | ||||
|   void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|     std::seed_seq src(seeds.begin(),seeds.end()); | ||||
|     Seed(src,0); | ||||
|   } | ||||
|  | ||||
|     void SeedUniqueString(const std::string &s){ | ||||
|       std::vector<int> seeds; | ||||
| @@ -330,65 +337,67 @@ namespace Grid { | ||||
|       std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl; | ||||
|       SeedFixedIntegers(seeds); | ||||
|     } | ||||
|   }; | ||||
| }; | ||||
|  | ||||
|   class GridParallelRNG : public GridRNGbase { | ||||
| class GridParallelRNG : public GridRNGbase { | ||||
| private: | ||||
|   double _time_counter; | ||||
|   GridBase *_grid; | ||||
|   unsigned int _vol; | ||||
|  | ||||
|     double _time_counter; | ||||
| public: | ||||
|   GridBase *Grid(void) const { return _grid; } | ||||
|   int generator_idx(int os,int is) { | ||||
|     return is*_grid->oSites()+os; | ||||
|   } | ||||
|  | ||||
|   public: | ||||
|     GridBase *_grid; | ||||
|     unsigned int _vol; | ||||
|   GridParallelRNG(GridBase *grid) : GridRNGbase() { | ||||
|     _grid = grid; | ||||
|     _vol  =_grid->iSites()*_grid->oSites(); | ||||
|  | ||||
|     int generator_idx(int os,int is) { | ||||
|       return is*_grid->oSites()+os; | ||||
|     } | ||||
|     _generators.resize(_vol); | ||||
|     _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); | ||||
|     _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|     _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); | ||||
|     _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); | ||||
|   } | ||||
|  | ||||
|     GridParallelRNG(GridBase *grid) : GridRNGbase() { | ||||
|       _grid = grid; | ||||
|       _vol  =_grid->iSites()*_grid->oSites(); | ||||
|   template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ | ||||
|  | ||||
|       _generators.resize(_vol); | ||||
|       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); | ||||
|       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); | ||||
|       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); | ||||
|       _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); | ||||
|     } | ||||
|     typedef typename vobj::scalar_object scalar_object; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ | ||||
|     double inner_time_counter = usecond(); | ||||
|  | ||||
|       typedef typename vobj::scalar_object scalar_object; | ||||
|       typedef typename vobj::scalar_type scalar_type; | ||||
|       typedef typename vobj::vector_type vector_type; | ||||
|     int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid | ||||
|     int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l.Grid() too | ||||
|     int osites = _grid->oSites();  // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity | ||||
|     int words  = sizeof(scalar_object) / sizeof(scalar_type); | ||||
|  | ||||
|       double inner_time_counter = usecond(); | ||||
|     autoView(l_v, l, CpuWrite); | ||||
|     thread_for( ss, osites, { | ||||
|       ExtractBuffer<scalar_object> buf(Nsimd); | ||||
|       for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times | ||||
|  | ||||
|       int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid | ||||
|       int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too | ||||
|       int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity | ||||
|       int words  = sizeof(scalar_object) / sizeof(scalar_type); | ||||
| 	int sm = multiplicity * ss + m;  // Maps the generator site to the fine site | ||||
|  | ||||
|       parallel_for(int ss=0;ss<osites;ss++){ | ||||
|         std::vector<scalar_object> buf(Nsimd); | ||||
|         for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times | ||||
|  | ||||
|           int sm = multiplicity * ss + m;  // Maps the generator site to the fine site | ||||
|  | ||||
|           for (int si = 0; si < Nsimd; si++) { | ||||
| 	for (int si = 0; si < Nsimd; si++) { | ||||
|              | ||||
|             int gdx = generator_idx(ss, si);  // index of generator state | ||||
|             scalar_type *pointer = (scalar_type *)&buf[si]; | ||||
|             dist[gdx].reset(); | ||||
|             for (int idx = 0; idx < words; idx++)  | ||||
|               fillScalar(pointer[idx], dist[gdx], _generators[gdx]); | ||||
|           } | ||||
|           // merge into SIMD lanes, FIXME suboptimal implementation | ||||
|           merge(l._odata[sm], buf); | ||||
|         } | ||||
| 	  int gdx = generator_idx(ss, si);  // index of generator state | ||||
| 	  scalar_type *pointer = (scalar_type *)&buf[si]; | ||||
| 	  dist[gdx].reset(); | ||||
| 	  for (int idx = 0; idx < words; idx++)  | ||||
| 	    fillScalar(pointer[idx], dist[gdx], _generators[gdx]); | ||||
| 	} | ||||
| 	// merge into SIMD lanes, FIXME suboptimal implementation | ||||
| 	merge(l_v[sm], buf); | ||||
|       } | ||||
|       }); | ||||
|     //    }); | ||||
|  | ||||
|       _time_counter += usecond()- inner_time_counter; | ||||
|     }; | ||||
|     _time_counter += usecond()- inner_time_counter; | ||||
|   } | ||||
|  | ||||
|     void SeedUniqueString(const std::string &s){ | ||||
|       std::vector<int> seeds; | ||||
| @@ -398,119 +407,119 @@ namespace Grid { | ||||
|       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; | ||||
|       SeedFixedIntegers(seeds); | ||||
|     } | ||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|   void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||
|  | ||||
|       // Everyone generates the same seed_seq based on input seeds | ||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|     // Everyone generates the same seed_seq based on input seeds | ||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||
|  | ||||
|       std::seed_seq source(seeds.begin(),seeds.end()); | ||||
|     std::seed_seq source(seeds.begin(),seeds.end()); | ||||
|  | ||||
|       RngEngine master_engine(source); | ||||
|     RngEngine master_engine(source); | ||||
|  | ||||
| #ifdef RNG_FAST_DISCARD | ||||
|       //////////////////////////////////////////////// | ||||
|       // Skip ahead through a single stream. | ||||
|       // Applicable to SITMO and other hash based/crypto RNGs | ||||
|       // Should be applicable to Mersenne Twister, but the C++11 | ||||
|       // MT implementation does not implement fast discard even though | ||||
|       // in principle this is possible | ||||
|       //////////////////////////////////////////////// | ||||
|  | ||||
|       // Everybody loops over global volume. | ||||
|       parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){ | ||||
|     //////////////////////////////////////////////// | ||||
|     // Skip ahead through a single stream. | ||||
|     // Applicable to SITMO and other hash based/crypto RNGs | ||||
|     // Should be applicable to Mersenne Twister, but the C++11 | ||||
|     // MT implementation does not implement fast discard even though | ||||
|     // in principle this is possible | ||||
|     //////////////////////////////////////////////// | ||||
|  | ||||
|     // Everybody loops over global volume. | ||||
|     thread_for( gidx, _grid->_gsites, { | ||||
| 	// Where is it? | ||||
| 	int rank,o_idx,i_idx; | ||||
| 	std::vector<int> gcoor; | ||||
| 	int rank; | ||||
| 	int o_idx; | ||||
| 	int i_idx; | ||||
|  | ||||
| 	Coordinate gcoor; | ||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); | ||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
|  | ||||
| 	 | ||||
| 	// If this is one of mine we take it | ||||
| 	if( rank == _grid->ThisRank() ){ | ||||
| 	  int l_idx=generator_idx(o_idx,i_idx); | ||||
| 	  _generators[l_idx] = master_engine; | ||||
| 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||
| 	} | ||||
|  | ||||
|       } | ||||
|     }); | ||||
| #else  | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       // Machine and thread decomposition dependent seeding is efficient | ||||
|       // and maximally parallel; but NOT reproducible from machine to machine.  | ||||
|       // Not ideal, but fastest way to reseed all nodes. | ||||
|       //////////////////////////////////////////////////////////////// | ||||
|       { | ||||
| 	// Obtain one Reseed per processor | ||||
| 	int Nproc = _grid->ProcessorCount(); | ||||
| 	std::vector<RngEngine> seeders(Nproc); | ||||
| 	int me= _grid->ThisRank(); | ||||
| 	for(int p=0;p<Nproc;p++){ | ||||
| 	  seeders[p] = Reseed(master_engine); | ||||
| 	} | ||||
| 	master_engine = seeders[me]; | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     // Machine and thread decomposition dependent seeding is efficient | ||||
|     // and maximally parallel; but NOT reproducible from machine to machine.  | ||||
|     // Not ideal, but fastest way to reseed all nodes. | ||||
|     //////////////////////////////////////////////////////////////// | ||||
|     { | ||||
|       // Obtain one Reseed per processor | ||||
|       int Nproc = _grid->ProcessorCount(); | ||||
|       std::vector<RngEngine> seeders(Nproc); | ||||
|       int me= _grid->ThisRank(); | ||||
|       for(int p=0;p<Nproc;p++){ | ||||
| 	seeders[p] = Reseed(master_engine); | ||||
|       } | ||||
|       master_engine = seeders[me]; | ||||
|     } | ||||
|  | ||||
|     { | ||||
|       // Obtain one reseeded generator per thread       | ||||
|       int Nthread = 32; // Hardwire a good level of parallelism | ||||
|       std::vector<RngEngine> seeders(Nthread); | ||||
|       for(int t=0;t<Nthread;t++){ | ||||
| 	seeders[t] = Reseed(master_engine); | ||||
|       } | ||||
|  | ||||
|       { | ||||
| 	// Obtain one reseeded generator per thread | ||||
| 	int Nthread = GridThread::GetThreads(); | ||||
| 	std::vector<RngEngine> seeders(Nthread); | ||||
| 	for(int t=0;t<Nthread;t++){ | ||||
| 	  seeders[t] = Reseed(master_engine); | ||||
| 	} | ||||
|  | ||||
| 	parallel_for(int t=0;t<Nthread;t++) { | ||||
| 	  // set up one per local site in threaded fashion | ||||
| 	  std::vector<uint32_t> newseeds; | ||||
| 	  std::uniform_int_distribution<uint32_t> uid;	 | ||||
| 	  for(int l=0;l<_grid->lSites();l++) { | ||||
| 	    if ( (l%Nthread)==t ) { | ||||
| 	      _generators[l] = Reseed(seeders[t],newseeds,uid); | ||||
| 	    } | ||||
|       thread_for( t, Nthread, { | ||||
| 	// set up one per local site in threaded fashion | ||||
| 	std::vector<uint32_t> newseeds; | ||||
| 	std::uniform_int_distribution<uint32_t> uid;	 | ||||
| 	for(int l=0;l<_grid->lSites();l++) { | ||||
| 	  if ( (l%Nthread)==t ) { | ||||
| 	    _generators[l] = Reseed(seeders[t],newseeds,uid); | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|       }); | ||||
|     } | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   void Report(){ | ||||
|     std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl; | ||||
|   } | ||||
|  | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   // Support for rigorous test of RNG's | ||||
|   // Return uniform random uint32_t from requested site generator | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   uint32_t GlobalU01(int gsite){ | ||||
|  | ||||
|     uint32_t the_number; | ||||
|     // who | ||||
|     int rank,o_idx,i_idx; | ||||
|     Coordinate gcoor; | ||||
|     _grid->GlobalIndexToGlobalCoor(gsite,gcoor); | ||||
|     _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
|  | ||||
|     // draw | ||||
|     int l_idx=generator_idx(o_idx,i_idx); | ||||
|     if( rank == _grid->ThisRank() ){ | ||||
|       the_number = _uid[l_idx](_generators[l_idx]); | ||||
|     } | ||||
|  | ||||
|     void Report(){ | ||||
|       std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     // Support for rigorous test of RNG's | ||||
|     // Return uniform random uint32_t from requested site generator | ||||
|     //////////////////////////////////////////////////////////////////////// | ||||
|     uint32_t GlobalU01(int gsite){ | ||||
|  | ||||
|       uint32_t the_number; | ||||
|       // who | ||||
|       std::vector<int> gcoor; | ||||
|       int rank,o_idx,i_idx; | ||||
|       _grid->GlobalIndexToGlobalCoor(gsite,gcoor); | ||||
|       _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
|  | ||||
|       // draw | ||||
|       int l_idx=generator_idx(o_idx,i_idx); | ||||
|       if( rank == _grid->ThisRank() ){ | ||||
| 	the_number = _uid[l_idx](_generators[l_idx]); | ||||
|       } | ||||
|        | ||||
|       // share & return | ||||
|       _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number)); | ||||
|       return the_number; | ||||
|     } | ||||
|     // share & return | ||||
|     _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number)); | ||||
|     return the_number; | ||||
|   } | ||||
|  | ||||
|   }; | ||||
| }; | ||||
|  | ||||
|   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  } | ||||
|   template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } | ||||
|   template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} | ||||
| template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  } | ||||
| template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } | ||||
| template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} | ||||
|  | ||||
|   template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); } | ||||
|   template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } | ||||
|   template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } | ||||
| template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); } | ||||
| template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } | ||||
| template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_LATTICE_TRACE_H | ||||
| #define GRID_LATTICE_TRACE_H | ||||
|  | ||||
| @@ -32,36 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| // Tracing, transposing, peeking, poking | ||||
| /////////////////////////////////////////////// | ||||
|  | ||||
| namespace Grid { | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Trace | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<class vobj> | ||||
|     inline auto trace(const Lattice<vobj> &lhs) | ||||
|       -> Lattice<decltype(trace(lhs._odata[0]))> | ||||
|     { | ||||
|       Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
|             ret._odata[ss] = trace(lhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|     }; | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Trace | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| /* | ||||
| template<class vobj> | ||||
| inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))> | ||||
| { | ||||
|   Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); | ||||
|   autoView(ret_v , ret, AcceleratorWrite); | ||||
|   autoView(lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { | ||||
|     coalescedWrite(ret_v[ss], trace(lhs_v(ss))); | ||||
|   }); | ||||
|   return ret; | ||||
| }; | ||||
| */ | ||||
|      | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Trace Index level dependent operation | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<int Index,class vobj> | ||||
|     inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> | ||||
|     { | ||||
|       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); | ||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||
| 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     }; | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Trace Index level dependent operation | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template<int Index,class vobj> | ||||
| inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> | ||||
| { | ||||
|   Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { | ||||
|     coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); | ||||
|   }); | ||||
|   return ret; | ||||
| }; | ||||
|  | ||||
|  | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|  | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user