dbollweg
							
						 
					 | 
					
						
						
							
						
						09af8c25a2
					 | 
					
						
						
							
							Merge branch 'paboyle:develop' into feature/sliceSum_gpu
						
						
						
						
						
						
					 | 
					
						2024-02-09 13:02:59 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								dbollweg
							
						 
					 | 
					
						
						
							
						
						9514035b87
					 | 
					
						
						
							
							refactor slicesum: slicesum uses GPU version by default now
						
						
						
						
						
						
					 | 
					
						2024-02-09 13:02:28 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						2da09ae99b
					 | 
					
						
						
							
							acceleration compiles and doesn't break scalar mode
						
						
						
						
						
						
					 | 
					
						2024-02-06 18:40:13 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						a38fb0e04a
					 | 
					
						
						
							
							first effort toward accelerators
						
						
						
						
						
						
					 | 
					
						2024-02-06 18:24:55 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					| 
						
					 | 
					
						
						
							
						
						7019916294
					 | 
					
						
						
							
							RNG seed change safer for large volumes; this is a long term solution
						
						
						
						
						
						
					 | 
					
						2024-02-07 00:56:39 +00:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								dbollweg
							
						 
					 | 
					
						
						
							
						
						1514b4f137
					 | 
					
						
						
							
							slicesum_sycl passes test
						
						
						
						
						
						
					 | 
					
						2024-02-06 19:08:44 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					| 
						
					 | 
					
						
						
							
						
						91cf5ee312
					 | 
					
						
						
							
							Updated bench script
						
						
						
						
						
						
					 | 
					
						2024-02-06 23:45:10 +00:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						0a6e2f42c5
					 | 
					
						
						
							
							small amount of cleanup
						
						
						
						
						
						
					 | 
					
						2024-02-06 16:32:07 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								dbollweg
							
						 
					 | 
					
						
						
							
						
						ab2de131bd
					 | 
					
						
						
							
							work towards sliceSum for sycl backend
						
						
						
						
						
						
					 | 
					
						2024-02-06 13:24:45 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					| 
						
					 | 
					
						
						
							
						
						5bfa88be85
					 | 
					
						
						
							
							Aurora MPI standalone benchmark and options that work well
						
						
						
						
						
						
					 | 
					
						2024-02-06 16:28:40 +00:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Dennis Bollweg
							
						 
					 | 
					
						
						
							
						
						5af8da76d7
					 | 
					
						
						
							
							Fix cuda compilation of Lattice_slicesum_gpu.h
						
						
						
						
						
						
					 | 
					
						2024-02-01 18:02:30 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Dennis Bollweg
							
						 
					 | 
					
						
						
							
						
						b8b9dc952d
					 | 
					
						
						
							
							Async memcpy's and cleanup
						
						
						
						
						
						
					 | 
					
						2024-02-01 17:55:35 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Dennis Bollweg
							
						 
					 | 
					
						
						
							
						
						79a6ed32d8
					 | 
					
						
						
							
							Use accelerator_for2d and DeviceSegmentedReduce to avoid kernel launch latencies
						
						
						
						
						
						
					 | 
					
						2024-02-01 16:41:03 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								dbollweg
							
						 
					 | 
					
						
						
							
						
						caa5f97723
					 | 
					
						
						
							
							Add sliceSum gpu using cub/hipcub
						
						
						
						
						
						
					 | 
					
						2024-01-31 16:50:06 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						4924b3209e
					 | 
					
						
						
							
							projectU3 yields a unitary matrix
						
						
						
						
						
						
					 | 
					
						2024-01-23 14:43:58 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						eb702f581b
					 | 
					
						
						
							
							Running on 12 rhs on 18 nodes of frontier
						
						
						
						
						
						
					 | 
					
						2024-01-22 17:44:15 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						3d13fd56c5
					 | 
					
						
						
							
							Precompute phases, save memory in hermitian
						
						
						
						
						
						
					 | 
					
						2024-01-22 17:43:35 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						6f51b49ef8
					 | 
					
						
						
							
							Use stderr
						
						
						
						
						
						
					 | 
					
						2024-01-22 17:41:09 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						addc638856
					 | 
					
						
						
							
							Fast localCopyRegion, blockProjectFast
						
						
						
						
						
						
					 | 
					
						2024-01-22 17:40:38 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						00f24f8765
					 | 
					
						
						
							
							already found some bugs in projection, still needs testing
						
						
						
						
						
						
					 | 
					
						2024-01-22 05:50:16 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						f5b3d582b0
					 | 
					
						
						
							
							first attempt at U3 projection
						
						
						
						
						
						
					 | 
					
						2024-01-22 02:49:40 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						981c93d67a
					 | 
					
						
						
							
							update Test_fatLinks to accept Naik
						
						
						
						
						
						
					 | 
					
						2024-01-21 21:09:19 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								david clarke
							
						 
					 | 
					
						
						
							
						
						c020b78e02
					 | 
					
						
						
							
							Merge branch 'develop' into hisq_fat_links
						
						
						
						
						
						
					 | 
					
						2024-01-21 20:21:08 -07:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						42ae36bc28
					 | 
					
						
						
							
							Working
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:39:14 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						c69f73ff9f
					 | 
					
						
						
							
							Working
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:38:46 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						ca5ae8a2e6
					 | 
					
						
						
							
							Revert to working.
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:32:05 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						d967eb53de
					 | 
					
						
						
							
							Working for first time
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:31:12 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						839f9f1bbe
					 | 
					
						
						
							
							Don't log memory by default
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:25:50 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						b754a152c6
					 | 
					
						
						
							
							Flag guard correctly
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:25:28 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						e07cb2b9de
					 | 
					
						
						
							
							Accelerator memory
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:24:31 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						a1f8bbb078
					 | 
					
						
						
							
							accelerator memory print
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:24:09 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						7909683f3b
					 | 
					
						
						
							
							MultiRHS
						
						
						
						
						
						
					 | 
					
						2024-01-17 16:21:07 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						25f71913b7
					 | 
					
						
						
							
							MultiRHS coarse
						
						
						
						
						
						
					 | 
					
						2024-01-04 12:01:17 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						34ddd2b7b1
					 | 
					
						
						
							
							MultiRHS coarse space
						
						
						
						
						
						
					 | 
					
						2024-01-04 12:00:53 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						d5fd90b2f3
					 | 
					
						
						
							
							Add 48^3 rtest
						
						
						
						
						
						
					 | 
					
						2024-01-04 12:00:01 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						b7c7000d0d
					 | 
					
						
						
							
							Don't need the numerical rounding tolerance in multigrid
						
						
						
						
						
						
					 | 
					
						2023-12-22 18:10:23 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						551f6c4edd
					 | 
					
						
						
							
							Synchronise changes
						
						
						
						
						
						
					 | 
					
						2023-12-22 18:09:11 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						defd814750
					 | 
					
						
						
							
							Speed up the coarsened matrix matrix evaluation.
						
						
						
						
						
						
						
						It is block project limited.
Could be sped up with calls to Batched GEMM and a data layout change. 
						
						
					 | 
					
						2023-12-22 18:07:03 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						3d517bbd2a
					 | 
					
						
						
							
							Synchronise decouple from the launch
						
						
						
						
						
						
						
						Speeds up multileg stencils 
						
						
					 | 
					
						2023-12-22 18:06:13 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						78ab955fec
					 | 
					
						
						
							
							Better padded cell exchange
						
						
						
						
						
						
					 | 
					
						2023-12-22 18:05:41 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						dd13937bb6
					 | 
					
						
						
							
							Better opt face gather scatter
						
						
						
						
						
						
					 | 
					
						2023-12-22 18:03:38 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						66a1b63aa9
					 | 
					
						
						
							
							Faster grid/blas layout change.
						
						
						
						
						
						
						
						Halo exchange is now the only slow part.
Revisit 
						
						
					 | 
					
						2023-12-21 20:50:18 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						22c611bd1a
					 | 
					
						
						
							
							Delete temp file
						
						
						
						
						
						
					 | 
					
						2023-12-21 18:32:31 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						c9bb1bf8ea
					 | 
					
						
						
							
							Passing new BLAS based
						
						
						
						
						
						
					 | 
					
						2023-12-21 18:31:17 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					| 
						
					 | 
					
						
						
							
						
						2a0d75bac2
					 | 
					
						
						
							
							Aurora files
						
						
						
						
						
						
					 | 
					
						2023-12-21 23:20:17 +00:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						9e489887cf
					 | 
					
						
						
							
							General coarse multiRHS move to BLAS implementation
						
						
						
						
						
						
					 | 
					
						2023-12-21 15:24:48 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						9feb801bb9
					 | 
					
						
						
							
							Much simpler GPU implementation
						
						
						
						
						
						
					 | 
					
						2023-12-21 15:24:06 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						c00b495933
					 | 
					
						
						
							
							Multigrid
						
						
						
						
						
						
					 | 
					
						2023-12-21 15:23:31 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						d22eebe553
					 | 
					
						
						
							
							BLAS options
						
						
						
						
						
						
					 | 
					
						2023-12-21 15:23:03 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Peter Boyle
							
						 
					 | 
					
						
						
							
						
						8bcbd82680
					 | 
					
						
						
							
							BLAS based layout and implementation
						
						
						
						
						
						
					 | 
					
						2023-12-21 15:21:24 -05:00 | 
					
					
						
						
						
							
							
							
							
							
							
						
					 |