mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-29 19:14:33 +00:00 
			
		
		
		
	Compare commits
	
		
			351 Commits
		
	
	
		
			feature/bl
			...
			27a5508ea1
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 27a5508ea1 | ||
| 7019916294 | |||
| 91cf5ee312 | |||
| 5bfa88be85 | |||
| 2a0d75bac2 | |||
|  | 37d1d87c3c | ||
|  | 1381dbc8ef | ||
|  | cc5ab624a2 | ||
|  | 72641211cd | ||
|  | 505cc6927b | ||
|  | f48298ad4e | ||
|  | 645e47c1ba | ||
|  | d1d9827263 | ||
|  | f516acda5f | ||
|  | 7a7aa61d52 | ||
|  | 14643c0aab | ||
|  | 867abeaf8e | ||
|  | b77a9b8947 | ||
|  | 7d077fe493 | ||
|  | 51051df62c | ||
|  | 33097681b9 | ||
|  | 07e4900218 | ||
|  | 36ab567d67 | ||
|  | e19171523b | ||
|  | 9626a2c7c0 | ||
|  | e936f5b80b | ||
|  | ffc0639cb9 | ||
|  | c5b43b322c | ||
|  | c9c4576237 | ||
|  | 6d0c2de399 | ||
|  | 7786ea9921 | ||
|  | d93eac7b1c | ||
|  | afc316f501 | ||
|  | f14bfd5c1b | ||
|  | c5f1420dea | ||
|  | 018e6da872 | ||
|  | b77bccfac2 | ||
|  | 80359e0d49 | ||
|  | 3d437c5cc4 | ||
|  | e5bc51779a | ||
|  | 157368ed04 | ||
|  | ec2ddda12c | ||
|  | 5a5c481d45 | ||
|  | 59dade8346 | ||
|  | b8a7004365 | ||
|  | bd56c95a6f | ||
|  | 994512048e | ||
|  | dbd8bb49dc | ||
|  | 3a29af0ce4 | ||
|  | f7b79cdd45 | ||
|  | 1bda8c47fa | ||
|  | 2100cc6497 | ||
|  | ef8af7bff8 | ||
|  | cb277ae516 | ||
|  | 075b9d22d0 | ||
|  | b92428f05f | ||
|  | 34b11864b6 | ||
|  | 1dfaa08afb | ||
|  | f44dce390f | ||
|  | bb71e9a96a | ||
| 78bae9417c | |||
| dd170ead01 | |||
| 014704856f | |||
|  | 6f6844ccf1 | ||
|  | 4c6613d72c | ||
|  | ee92e08edb | ||
|  | c1dcee9328 | ||
|  | 559257bbe9 | ||
|  | 6b150961fe | ||
|  | cff1f8d3b8 | ||
|  | f27d2083cd | ||
|  | 36cc9c524f | ||
|  | 2822487450 | ||
|  | e07fafe46a | ||
|  | 063d290bd8 | ||
|  | 4e6194d92a | ||
|  | de30c4e22a | ||
|  | 5bafcaedfa | ||
|  | bfeceae708 | ||
|  | eacb66591f | ||
|  | fadaa85626 | ||
|  | 02a5b0d786 | ||
|  | 0e2141442a | ||
|  | 769eb0eecb | ||
|  | 4241c7d4a3 | ||
|  | 7b11075102 | ||
|  | abc658dca5 | ||
|  | 2372275b2c | ||
|  | ef736e8aa4 | ||
|  | 5e539e2d54 | ||
|  | 96773f5254 | ||
|  | d80df09f3b | ||
|  | 621e612c30 | ||
|  | 8c3792721b | ||
|  | c95bbd3948 | ||
|  | e28ab7a732 | ||
|  | c797cbe737 | ||
|  | e09dfbf1c2 | ||
| 85e35c4da1 | |||
|  | d72e914cf0 | ||
|  | 3b5254e2d5 | ||
|  | f1c358b596 | ||
|  | c0ef210265 | ||
|  | e3e1cc1962 | ||
|  | 723eadbb5c | ||
|  | e24637ec1e | ||
|  | 8b01ff4ce7 | ||
|  | 588197c487 | ||
|  | 116d90b0ee | ||
|  | b0646ca187 | ||
|  | 1352bad2e4 | ||
|  | 4895ff260e | ||
|  | 470d93006a | ||
|  | 2f3d03f188 | ||
|  | 8db7c23bee | ||
|  | 69dc5172dc | ||
|  | fd72eb6546 | ||
|  | ffd7301649 | ||
|  | d2a8494044 | ||
|  | 0982e0d19b | ||
|  | 3badbfc3c1 | ||
|  | 5465961e30 | ||
| 477b794bc5 | |||
|  | 4835fd1a87 | ||
|  | 6533c25814 | ||
|  | b405767569 | ||
|  | fe88a0c12f | ||
|  | e61a9ed2b4 | ||
|  | de8daa3824 | ||
|  | 3a50fb29cb | ||
|  | 6647d2656f | ||
|  | a6f4dbeb6d | ||
|  | 92a282f2d8 | ||
|  | ca2fd9fc7b | ||
|  | be1a4f5860 | ||
|  | 1b2914ec09 | ||
|  | 519f795066 | ||
|  | 5897b93dd4 | ||
|  | af091e0881 | ||
|  | 3c1e5e9517 | ||
|  | 85b2cb7a8a | ||
|  | 4240ad5ca8 | ||
|  | d418347d86 | ||
|  | 29a4bfe5e5 | ||
|  | 9955bf9daf | ||
|  | b8bdc2eefb | ||
|  | 0078826ff1 | ||
|  | e855c41772 | ||
|  | d169c275b6 | ||
|  | a5125e23f4 | ||
|  | 7b83c80757 | ||
|  | e41821e206 | ||
|  | 5a75ab15a2 | ||
|  | 932c783fbf | ||
|  | 55f9cce577 | ||
|  | b3533ca847 | ||
|  | fd2a637010 | ||
|  | eee27b8b30 | ||
|  | 8522352aa3 | ||
|  | 3beb8f4091 | ||
|  | 12a706e9b1 | ||
|  | 170aa7df01 | ||
|  | e8ad1fef53 | ||
|  | 876c8f4478 | ||
|  | 9c8750f261 | ||
|  | 91efd08179 | ||
|  | 9953511b65 | ||
|  | 025fa9991a | ||
|  | e8c60c355b | ||
|  | 6c9c7f9d85 | ||
|  | f534523ede | ||
|  | 1b8a834beb | ||
|  | aa9df63a05 | ||
|  | 2b6b98be48 | ||
|  | 3953312a93 | ||
|  | 6e62f4f616 | ||
|  | 6a7bdca53b | ||
|  | c7fba9aace | ||
|  | ac6c7cb8d6 | ||
|  | c5924833a1 | ||
|  | ac0a74be0d | ||
|  | 42b0e1125d | ||
|  | 339c4fda79 | ||
|  | 9b85bf9402 | ||
|  | 86b02c3cd8 | ||
|  | 7b3b7093fa | ||
|  | 881b08a465 | ||
|  | 3ee5444c69 | ||
|  | 5e28fe56d2 | ||
|  | 3aa43e6065 | ||
|  | 78ac4044ff | ||
|  | 119c3db47f | ||
|  | 21bbdb8fc2 | ||
|  | 5aabe074fe | ||
|  | 739bd7572c | ||
|  | 074627a5bd | ||
|  | 6a23b2c599 | ||
|  | dace904c10 | ||
|  | be98d26610 | ||
|  | bd891fb3f5 | ||
|  | 3984265851 | ||
|  | 45361d188f | ||
|  | 80c9d77e02 | ||
|  | 3aff64dddb | ||
|  | b4f2ca81ff | ||
|  | d1dea5f840 | ||
|  | 54f8b84d16 | ||
|  | da503fef0e | ||
|  | 4a6802098a | ||
|  | f9b41a84d2 | ||
| 5d7e0d18b9 | |||
| 9e64387933 | |||
| 983b681d46 | |||
| 4072408b6f | |||
| bd76b47fbf | |||
|  | 178376f24b | ||
| 18ce23aa75 | |||
|  | 5b50eaa55f | ||
|  | 6a0eb466ee | ||
|  | ffa7fe0cc2 | ||
|  | 6b979f0a69 | ||
|  | 4ea29b8f0f | ||
|  | 778291230a | ||
|  | 3671ace5a1 | ||
|  | 86dac5ff4f | ||
|  | 4a382fad3f | ||
|  | cc753670d9 | ||
|  | cc9d88ea1c | ||
|  | b281b0166e | ||
|  | 6a21f694ff | ||
|  | fc4db5e963 | ||
|  | 6252ffaf76 | ||
|  | 026e736dfa | ||
|  | 4275b3f431 | ||
|  | af64c1c6b6 | ||
|  | 866f48391a | ||
|  | a4df527d74 | ||
|  | 5764d21161 | ||
|  | 496d04cd85 | ||
|  | 10e6d7c6ce | ||
|  | c42e25e5b8 | ||
|  | a00ae981e0 | ||
|  | 58e020b62a | ||
|  | a7e1aceeca | ||
|  | 7212432f43 | ||
|  | 4a261fab30 | ||
|  | 6af97069b9 | ||
|  | 5068413cdb | ||
|  | 71c6960eea | ||
|  | ddf6d5c9e3 | ||
| 39214702f6 | |||
| 3e4614c63a | |||
|  | 900e01f49b | ||
|  | 2376156fbc | ||
|  | 3f2fd49db4 | ||
|  | 0efa107cb6 | ||
|  | 8feedb4f6f | ||
|  | 05e562e3d7 | ||
|  | dd3bbb8fa2 | ||
|  | 2fbcf13c46 | ||
|  | 4ea48ef0c4 | ||
|  | 5c85774ee3 | ||
|  | d8a9a745d8 | ||
|  | dcf172da3b | ||
|  | 546be724e7 | ||
|  | 481bbaf1fc | ||
|  | 281488611a | ||
|  | bae0f8ea99 | ||
|  | bbbcd36ae5 | ||
|  | 1b8176e2c0 | ||
|  | cbc053c3db | ||
|  | cdf3f6ef6e | ||
|  | ba7f9d7b70 | ||
|  | 371fd123fb | ||
|  | d6ff644aab | ||
|  | 29586f6b5e | ||
|  | fd057c838f | ||
|  | f51222086c | ||
| a3e935c902 | |||
| 7731c7db8e | |||
| ff97340324 | |||
| 920a51438d | |||
| be528b6d27 | |||
|  | f73691ec47 | ||
|  | ccd21f96ff | ||
|  | 4b90cb8888 | ||
|  | 7ebda3e9ec | ||
|  | b10e1b7bc8 | ||
|  | 7d62f1d6d2 | ||
|  | a9df27f18d | ||
|  | 458c943987 | ||
|  | 88015b0858 | ||
|  | 26ad759469 | ||
|  | ed723909a2 | ||
|  | 36ffe79093 | ||
|  | 1df8669898 | ||
|  | d7dea44ce7 | ||
|  | 37b6b82869 | ||
|  | 92ad5b8f74 | ||
|  | f6661ce29b | ||
|  | 9b3ac3c23f | ||
|  | c33a3b3b40 | ||
|  | 40ee605591 | ||
|  | 8c80f1c168 | ||
|  | 0af7d5a793 | ||
|  | 505fa49983 | ||
|  | 7bcf33def9 | ||
|  | a13820656a | ||
|  | fa71b46a41 | ||
|  | b8b3ae6ac1 | ||
|  | 55c008da21 | ||
|  | 2507606bd0 | ||
|  | 7c2ad4f8c8 | ||
|  | 54c8025aad | ||
|  | 921e23e83c | ||
|  | 6e750ecb0e | ||
|  | b8f1f5d2a3 | ||
|  | 9273f2937c | ||
|  | 1aa28b47ae | ||
|  | 629cb2987a | ||
|  | 03235d6368 | ||
|  | 22064c7e4c | ||
|  | 2de03e5172 | ||
|  | 3af4929dda | ||
|  | 1ba429345b | ||
|  | 88bdd4344b | ||
|  | 4044536eea | ||
|  | 4d8ae6221c | ||
|  | 4e31e4e094 | ||
|  | 0d6674e489 | ||
|  | b145fd4f5b | ||
|  | 8a5b794f25 | ||
|  | 291e80f88a | ||
|  | 1ace5850ae | ||
|  | 283f14b7c1 | ||
|  | 1d6e708083 | ||
|  | 89457e25e3 | ||
|  | 7e3b298d3d | ||
|  | 7ff3e5eed4 | ||
|  | 19eb51cf41 | ||
|  | 470d4dcc6d | ||
|  | ed03bfd555 | ||
|  | 8c0fbcccae | ||
|  | d4866157fe | ||
|  | b6496b6cb5 | ||
|  | 4f5fe57920 | ||
|  | 11fb943b1e | ||
|  | 046a23121e | ||
|  | c2f8ba194e | ||
|  | 229ce57fef | ||
|  | 712b326e40 | 
							
								
								
									
										54
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | ||||
| name: Bug report | ||||
| description: Report a bug. | ||||
| title: "<insert title>" | ||||
| labels: [bug] | ||||
|  | ||||
| body: | ||||
|   - type: markdown | ||||
|     attributes: | ||||
|       value: > | ||||
|         Thank you for taking the time to file a bug report. | ||||
|         Please check that the code is pointing to the HEAD of develop | ||||
|         or any commit in master which is tagged with a version number. | ||||
|  | ||||
|   - type: textarea | ||||
|     attributes: | ||||
|       label: "Describe the issue:" | ||||
|       description: > | ||||
|         Describe the issue and any previous attempt to solve it. | ||||
|     validations: | ||||
|       required: true | ||||
|  | ||||
|   - type: textarea | ||||
|     attributes: | ||||
|       label: "Code example:" | ||||
|       description: > | ||||
|         If relevant, show how to reproduce the issue using a minimal working | ||||
|         example. | ||||
|       placeholder: | | ||||
|         << your code here >> | ||||
|       render: shell | ||||
|     validations: | ||||
|       required: false | ||||
|  | ||||
|   - type: textarea | ||||
|     attributes: | ||||
|       label: "Target platform:" | ||||
|       description: > | ||||
|         Give a description of the target platform (CPU, network, compiler). | ||||
|         Please give the full CPU part description, using for example | ||||
|         `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) | ||||
|         or `sysctl machdep.cpu.brand_string` (macOS) and the full output | ||||
|         the `--version` option of your compiler. | ||||
|     validations: | ||||
|       required: true | ||||
|  | ||||
|   - type: textarea | ||||
|     attributes: | ||||
|       label: "Configure options:" | ||||
|       description: > | ||||
|         Please give the exact configure command used and attach | ||||
|         `config.log`, `grid.config.summary` and the output of `make V=1`. | ||||
|       render: shell | ||||
|     validations: | ||||
|       required: true | ||||
| @@ -66,6 +66,10 @@ if BUILD_FERMION_REPS | ||||
|   extra_sources+=$(ADJ_FERMION_FILES) | ||||
|   extra_sources+=$(TWOIND_FERMION_FILES) | ||||
| endif | ||||
| if BUILD_SP | ||||
|     extra_sources+=$(SP_FERMION_FILES) | ||||
|     extra_sources+=$(SP_TWOIND_FERMION_FILES) | ||||
| endif | ||||
|  | ||||
| lib_LIBRARIES = libGrid.a | ||||
|  | ||||
|   | ||||
| @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h> | ||||
| #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | ||||
|   | ||||
| @@ -542,6 +542,7 @@ public: | ||||
|       (*this)(in[i], out[i]); | ||||
|     } | ||||
|   } | ||||
|   virtual ~LinearFunction(){}; | ||||
| }; | ||||
|  | ||||
| template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { | ||||
|   | ||||
| @@ -191,7 +191,7 @@ public: | ||||
| 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||
|  | ||||
| 	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||
| 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||
|  | ||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
|   | ||||
							
								
								
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,213 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||
| #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| //Mixed precision restarted defect correction CG | ||||
| template<class FieldD,class FieldF,  | ||||
|   typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, | ||||
|   typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||
| class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> { | ||||
| public: | ||||
|   using LinearFunction<FieldD>::operator(); | ||||
|   RealD   Tolerance; | ||||
|   RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||
|   Integer MaxInnerIterations; | ||||
|   Integer MaxOuterIterations; | ||||
|   Integer MaxPatchupIterations; | ||||
|   GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||
|   RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||
|   LinearOperatorBase<FieldF> &Linop_f; | ||||
|   LinearOperatorBase<FieldD> &Linop_d; | ||||
|  | ||||
|   //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||
|   LinearFunction<FieldF> *guesser; | ||||
|   bool updateResidual; | ||||
|    | ||||
|   MixedPrecisionConjugateGradientBatched(RealD tol,  | ||||
|           Integer maxinnerit,  | ||||
|           Integer maxouterit,  | ||||
|           Integer maxpatchit, | ||||
|           GridBase* _sp_grid,  | ||||
|           LinearOperatorBase<FieldF> &_Linop_f,  | ||||
|           LinearOperatorBase<FieldD> &_Linop_d, | ||||
|           bool _updateResidual=true) : | ||||
|     Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||
|     Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), | ||||
|     OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; | ||||
|  | ||||
|   void useGuesser(LinearFunction<FieldF> &g){ | ||||
|     guesser = &g; | ||||
|   } | ||||
|    | ||||
|   void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
|     std::vector<FieldD> srcs_d_in{src_d_in}; | ||||
|     std::vector<FieldD> sols_d{sol_d}; | ||||
|  | ||||
|     (*this)(srcs_d_in,sols_d); | ||||
|  | ||||
|     sol_d = sols_d[0]; | ||||
|   } | ||||
|  | ||||
|   void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ | ||||
|     assert(src_d_in.size() == sol_d.size()); | ||||
|     int NBatch = src_d_in.size(); | ||||
|  | ||||
|     std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; | ||||
|  | ||||
|     Integer TotalOuterIterations = 0; //Number of restarts | ||||
|     std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations | ||||
|     std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step | ||||
|    | ||||
|     GridStopWatch TotalTimer; | ||||
|     TotalTimer.Start(); | ||||
|  | ||||
|     GridStopWatch InnerCGtimer; | ||||
|     GridStopWatch PrecChangeTimer; | ||||
|      | ||||
|     int cb = src_d_in[0].Checkerboard(); | ||||
|      | ||||
|     std::vector<RealD> src_norm; | ||||
|     std::vector<RealD> norm; | ||||
|     std::vector<RealD> stop; | ||||
|      | ||||
|     GridBase* DoublePrecGrid = src_d_in[0].Grid(); | ||||
|     FieldD tmp_d(DoublePrecGrid); | ||||
|     tmp_d.Checkerboard() = cb; | ||||
|      | ||||
|     FieldD tmp2_d(DoublePrecGrid); | ||||
|     tmp2_d.Checkerboard() = cb; | ||||
|  | ||||
|     std::vector<FieldD> src_d; | ||||
|     std::vector<FieldF> src_f; | ||||
|     std::vector<FieldF> sol_f; | ||||
|  | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       sol_d[i].Checkerboard() = cb; | ||||
|  | ||||
|       src_norm.push_back(norm2(src_d_in[i])); | ||||
|       norm.push_back(0.); | ||||
|       stop.push_back(src_norm[i] * Tolerance*Tolerance); | ||||
|  | ||||
|       src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation | ||||
|  | ||||
|       src_f.push_back(SinglePrecGrid); | ||||
|       src_f[i].Checkerboard() = cb; | ||||
|  | ||||
|       sol_f.push_back(SinglePrecGrid); | ||||
|       sol_f[i].Checkerboard() = cb; | ||||
|     } | ||||
|      | ||||
|     RealD inner_tol = InnerTolerance; | ||||
|      | ||||
|     ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|     CG_f.ErrorOnNoConverge = false; | ||||
|      | ||||
|     Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|        | ||||
|     for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
|       std::cout << GridLogMessage << std::endl; | ||||
|       std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; | ||||
|        | ||||
|       bool allConverged = true; | ||||
|        | ||||
|       for (int i=0; i<NBatch; i++) { | ||||
|         //Compute double precision rsd and also new RHS vector. | ||||
|         Linop_d.HermOp(sol_d[i], tmp_d); | ||||
|         norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector | ||||
|          | ||||
|         std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl; | ||||
|  | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(src_f[i], src_d[i]); | ||||
|         PrecChangeTimer.Stop(); | ||||
|          | ||||
|         sol_f[i] = Zero(); | ||||
|        | ||||
|         if(norm[i] > OuterLoopNormMult * stop[i]) { | ||||
|           allConverged = false; | ||||
|         } | ||||
|       } | ||||
|       if (allConverged) break; | ||||
|  | ||||
|       if (updateResidual) { | ||||
|         RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); | ||||
|         RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); | ||||
|         while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||
|         CG_f.Tolerance = inner_tol; | ||||
|       } | ||||
|  | ||||
|       //Optionally improve inner solver guess (eg using known eigenvectors) | ||||
|       if(guesser != NULL) { | ||||
|         (*guesser)(src_f, sol_f); | ||||
|       } | ||||
|  | ||||
|       for (int i=0; i<NBatch; i++) { | ||||
|         //Inner CG | ||||
|         InnerCGtimer.Start(); | ||||
|         CG_f(Linop_f, src_f[i], sol_f[i]); | ||||
|         InnerCGtimer.Stop(); | ||||
|         TotalInnerIterations[i] += CG_f.IterationsToComplete; | ||||
|          | ||||
|         //Convert sol back to double and add to double prec solution | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(tmp_d, sol_f[i]); | ||||
|         PrecChangeTimer.Stop(); | ||||
|          | ||||
|         axpy(sol_d[i], 1.0, tmp_d, sol_d[i]); | ||||
|       } | ||||
|  | ||||
|     } | ||||
|      | ||||
|     //Final trial CG | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations); | ||||
|       CG_d(Linop_d, src_d_in[i], sol_d[i]); | ||||
|       TotalFinalStepIterations[i] += CG_d.IterationsToComplete; | ||||
|     } | ||||
|  | ||||
|     TotalTimer.Stop(); | ||||
|  | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl; | ||||
|     } | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|      | ||||
|   } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
| @@ -166,16 +166,16 @@ public: | ||||
|       rsqf[s] =rsq[s]; | ||||
|       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl; | ||||
|       //      ps_d[s] = src_d; | ||||
|       precisionChangeFast(ps_f[s],src_d); | ||||
|       precisionChange(ps_f[s],src_d); | ||||
|     } | ||||
|     // r and p for primary | ||||
|     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys | ||||
|     r_d = p_d; | ||||
|      | ||||
|     //MdagM+m[0] | ||||
|     precisionChangeFast(p_f,p_d); | ||||
|     precisionChange(p_f,p_d); | ||||
|     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||
|     precisionChangeFast(tmp_d,mmp_f); | ||||
|     precisionChange(tmp_d,mmp_f); | ||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||
|     tmp_d = tmp_d - mmp_d; | ||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||
| @@ -204,7 +204,7 @@ public: | ||||
|    | ||||
|     for(int s=0;s<nshift;s++) { | ||||
|       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); | ||||
|       precisionChangeFast(psi_f[s],psi_d[s]); | ||||
|       precisionChange(psi_f[s],psi_d[s]); | ||||
|     } | ||||
|    | ||||
|     /////////////////////////////////////// | ||||
| @@ -225,7 +225,7 @@ public: | ||||
|       AXPYTimer.Stop(); | ||||
|  | ||||
|       PrecChangeTimer.Start(); | ||||
|       precisionChangeFast(r_f, r_d); | ||||
|       precisionChange(r_f, r_d); | ||||
|       PrecChangeTimer.Stop(); | ||||
|  | ||||
|       AXPYTimer.Start(); | ||||
| @@ -243,13 +243,13 @@ public: | ||||
|  | ||||
|       cp=c; | ||||
|       PrecChangeTimer.Start(); | ||||
|       precisionChangeFast(p_f, p_d); //get back single prec search direction for linop | ||||
|       precisionChange(p_f, p_d); //get back single prec search direction for linop | ||||
|       PrecChangeTimer.Stop(); | ||||
|       MatrixTimer.Start();   | ||||
|       Linop_f.HermOp(p_f,mmp_f); | ||||
|       MatrixTimer.Stop();   | ||||
|       PrecChangeTimer.Start(); | ||||
|       precisionChangeFast(mmp_d, mmp_f); // From Float to Double | ||||
|       precisionChange(mmp_d, mmp_f); // From Float to Double | ||||
|       PrecChangeTimer.Stop(); | ||||
|  | ||||
|       d=real(innerProduct(p_d,mmp_d));     | ||||
| @@ -311,7 +311,7 @@ public: | ||||
| 	SolverTimer.Stop(); | ||||
|  | ||||
| 	for(int s=0;s<nshift;s++){ | ||||
| 	  precisionChangeFast(psi_d[s],psi_f[s]); | ||||
| 	  precisionChange(psi_d[s],psi_f[s]); | ||||
| 	} | ||||
|  | ||||
| 	 | ||||
|   | ||||
| @@ -211,7 +211,7 @@ public: | ||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||
|     tmp_d = tmp_d - mmp_d; | ||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||
|     //    assert(norm2(tmp_d)< 1.0e-4); | ||||
|     assert(norm2(tmp_d)< 1.0); | ||||
|  | ||||
|     axpy(mmp_d,mass[0],p_d,mmp_d); | ||||
|     RealD rn = norm2(p_d); | ||||
|   | ||||
| @@ -419,14 +419,15 @@ until convergence | ||||
| 	} | ||||
|       } | ||||
|  | ||||
|       if ( Nconv < Nstop ) | ||||
|       if ( Nconv < Nstop ) { | ||||
| 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl; | ||||
|  | ||||
| 	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl; | ||||
|       } | ||||
|       eval=eval2; | ||||
|        | ||||
|       //Keep only converged | ||||
|       eval.resize(Nconv);// Nstop? | ||||
|       evec.resize(Nconv,grid);// Nstop? | ||||
|       eval.resize(Nstop);// was Nconv | ||||
|       evec.resize(Nstop,grid);// was Nconv | ||||
|       basisSortInPlace(evec,eval,reverse); | ||||
|        | ||||
|     } | ||||
|   | ||||
| @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /*Allocation types, saying which pointer cache should be used*/ | ||||
| #define Cpu      (0) | ||||
| #define CpuSmall (1) | ||||
| #define Acc      (2) | ||||
| #define AccSmall (3) | ||||
| #define Shared   (4) | ||||
| #define SharedSmall (5) | ||||
| #define CpuHuge  (1) | ||||
| #define CpuSmall (2) | ||||
| #define Acc      (3) | ||||
| #define AccHuge  (4) | ||||
| #define AccSmall (5) | ||||
| #define Shared   (6) | ||||
| #define SharedHuge  (7) | ||||
| #define SharedSmall (8) | ||||
| #undef GRID_MM_VERBOSE  | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| @@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void) | ||||
|    | ||||
| } | ||||
|  | ||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||
| uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Data tables for recently freed pooiniter caches | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | ||||
| int MemoryManager::Victim[MemoryManager::NallocType]; | ||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; | ||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; | ||||
| uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Actual allocation and deallocation utils | ||||
| @@ -170,6 +176,16 @@ void MemoryManager::Init(void) | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   str= getenv("GRID_ALLOC_NCACHE_HUGE"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
|     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||
|       Ncache[CpuHuge]=Nc; | ||||
|       Ncache[AccHuge]=Nc; | ||||
|       Ncache[SharedHuge]=Nc; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   str= getenv("GRID_ALLOC_NCACHE_SMALL"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
| @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { | ||||
|    | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl; | ||||
| #endif | ||||
|    | ||||
| #ifdef GRID_UVM | ||||
| @@ -204,6 +222,9 @@ void MemoryManager::InitMessage(void) { | ||||
| #ifdef GRID_SYCL | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_OMPTARGET | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET managed memory"<<std::endl; | ||||
| #endif | ||||
| #else | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl; | ||||
| #ifdef GRID_CUDA | ||||
| @@ -215,6 +236,9 @@ void MemoryManager::InitMessage(void) { | ||||
| #ifdef GRID_SYCL | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl; | ||||
| #endif | ||||
| #ifdef GRID_OMPTARGET | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET omp_alloc_device"<<std::endl; | ||||
| #endif | ||||
| #endif | ||||
|  | ||||
| } | ||||
| @@ -222,8 +246,11 @@ void MemoryManager::InitMessage(void) { | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); | ||||
|   int cache = type + small; | ||||
|   int cache; | ||||
|   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||
|   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||
|   else                                     cache = type; | ||||
|  | ||||
|   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   | ||||
| #else | ||||
|   return ptr; | ||||
| @@ -232,11 +259,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) | ||||
|  | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | ||||
| { | ||||
|   assert(ncache>0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   if (ncache == 0) return ptr; | ||||
|  | ||||
|   void * ret = NULL; | ||||
|   int v = -1; | ||||
|  | ||||
| @@ -271,8 +299,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | ||||
| void *MemoryManager::Lookup(size_t bytes,int type) | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); | ||||
|   int cache = type+small; | ||||
|   int cache; | ||||
|   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||
|   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||
|   else                                     cache = type; | ||||
|  | ||||
|   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); | ||||
| #else | ||||
|   return NULL; | ||||
| @@ -281,7 +312,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) | ||||
|  | ||||
| void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | ||||
| { | ||||
|   assert(ncache>0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|   | ||||
| @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); | ||||
| // Move control to configure.ac and Config.h? | ||||
|  | ||||
| #define GRID_ALLOC_SMALL_LIMIT (4096) | ||||
| #define GRID_ALLOC_HUGE_LIMIT  (2147483648) | ||||
|  | ||||
| #define STRINGIFY(x) #x | ||||
| #define TOSTRING(x) STRINGIFY(x) | ||||
| @@ -70,6 +71,21 @@ enum ViewMode { | ||||
|   CpuWriteDiscard = 0x10 // same for now | ||||
| }; | ||||
|  | ||||
| struct MemoryStatus { | ||||
|   uint64_t     DeviceBytes; | ||||
|   uint64_t     DeviceLRUBytes; | ||||
|   uint64_t     DeviceMaxBytes; | ||||
|   uint64_t     HostToDeviceBytes; | ||||
|   uint64_t     DeviceToHostBytes; | ||||
|   uint64_t     HostToDeviceXfer; | ||||
|   uint64_t     DeviceToHostXfer; | ||||
|   uint64_t     DeviceEvictions; | ||||
|   uint64_t     DeviceDestroy; | ||||
|   uint64_t     DeviceAllocCacheBytes; | ||||
|   uint64_t     HostAllocCacheBytes; | ||||
| }; | ||||
|  | ||||
|  | ||||
| class MemoryManager { | ||||
| private: | ||||
|  | ||||
| @@ -83,7 +99,7 @@ private: | ||||
|   } AllocationCacheEntry; | ||||
|  | ||||
|   static const int NallocCacheMax=128;  | ||||
|   static const int NallocType=6; | ||||
|   static const int NallocType=9; | ||||
|   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; | ||||
|   static int Victim[NallocType]; | ||||
|   static int Ncache[NallocType]; | ||||
| @@ -121,7 +137,26 @@ private: | ||||
|   static uint64_t     DeviceToHostXfer; | ||||
|   static uint64_t     DeviceEvictions; | ||||
|   static uint64_t     DeviceDestroy; | ||||
|   | ||||
|    | ||||
|   static uint64_t     DeviceCacheBytes(); | ||||
|   static uint64_t     HostCacheBytes(); | ||||
|  | ||||
|   static MemoryStatus GetFootprint(void) { | ||||
|     MemoryStatus stat; | ||||
|     stat.DeviceBytes       = DeviceBytes; | ||||
|     stat.DeviceLRUBytes    = DeviceLRUBytes; | ||||
|     stat.DeviceMaxBytes    = DeviceMaxBytes; | ||||
|     stat.HostToDeviceBytes = HostToDeviceBytes; | ||||
|     stat.DeviceToHostBytes = DeviceToHostBytes; | ||||
|     stat.HostToDeviceXfer  = HostToDeviceXfer; | ||||
|     stat.DeviceToHostXfer  = DeviceToHostXfer; | ||||
|     stat.DeviceEvictions   = DeviceEvictions; | ||||
|     stat.DeviceDestroy     = DeviceDestroy; | ||||
|     stat.DeviceAllocCacheBytes = DeviceCacheBytes(); | ||||
|     stat.HostAllocCacheBytes   = HostCacheBytes(); | ||||
|     return stat; | ||||
|   }; | ||||
|    | ||||
|  private: | ||||
| #ifndef GRID_UVM | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   | ||||
| @@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s) | ||||
|   uint64_t LruBytes1=0; | ||||
|   uint64_t LruBytes2=0; | ||||
|   uint64_t LruCnt=0; | ||||
|   uint64_t LockedBytes=0; | ||||
|    | ||||
|   std::cout << " Memory Manager::Audit() from "<<s<<std::endl; | ||||
|   for(auto it=LRU.begin();it!=LRU.end();it++){ | ||||
|   | ||||
| @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | ||||
| } | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||
| { | ||||
|   acceleratorCopySynchronise(); | ||||
|   StencilBarrier();// Synch shared memory on a single nodes | ||||
|  | ||||
|   int nreq=list.size(); | ||||
|  | ||||
|   if (nreq==0) return; | ||||
|   | ||||
| @@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | ||||
| 							 int recv_from_rank,int dor, | ||||
| 							 int xbytes,int rbytes, int dir) | ||||
| { | ||||
|   return 2.0*bytes; | ||||
|   return xbytes+rbytes; | ||||
| } | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||
| { | ||||
|   | ||||
| @@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void) | ||||
|   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; | ||||
|   return ShmCommBufs[ShmRank]; | ||||
| } | ||||
| static inline int divides(int a,int b) | ||||
| { | ||||
|   return ( b == ( (b/a)*a ) ); | ||||
| } | ||||
| void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Allow user to configure through environment variable | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); | ||||
|   if ( str ) { | ||||
|     std::vector<int> IntShmDims; | ||||
|     GridCmdOptionIntVector(std::string(str),IntShmDims); | ||||
|     assert(IntShmDims.size() == WorldDims.size()); | ||||
|     long ShmSize = 1; | ||||
|     for (int dim=0;dim<WorldDims.size();dim++) { | ||||
|       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); | ||||
|       assert(divides(ShmDims[dim],WorldDims[dim])); | ||||
|     } | ||||
|     assert(ShmSize == WorldShmSize); | ||||
|     return; | ||||
|   } | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Powers of 2,3,5 only in prime decomposition for now | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int ndimension = WorldDims.size(); | ||||
|   ShmDims=Coordinate(ndimension,1); | ||||
|  | ||||
|   std::vector<int> primes({2,3,5}); | ||||
|  | ||||
|   int dim = 0; | ||||
|   int last_dim = ndimension - 1; | ||||
|   int AutoShmSize = 1; | ||||
|   while(AutoShmSize != WorldShmSize) { | ||||
|     int p; | ||||
|     for(p=0;p<primes.size();p++) { | ||||
|       int prime=primes[p]; | ||||
|       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) | ||||
|         && divides(prime,WorldShmSize/AutoShmSize)  ) { | ||||
|   AutoShmSize*=prime; | ||||
|   ShmDims[dim]*=prime; | ||||
|   last_dim = dim; | ||||
|   break; | ||||
|       } | ||||
|     } | ||||
|     if (p == primes.size() && last_dim == dim) { | ||||
|       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; | ||||
|       exit(EXIT_FAILURE); | ||||
|     } | ||||
|     dim=(dim+1) %ndimension; | ||||
|   } | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid);  | ||||
|  | ||||
|   | ||||
| @@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de> | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #define Mheader "SharedMemoryMpi: " | ||||
|  | ||||
| #include <Grid/GridCore.h> | ||||
| #include <pwd.h> | ||||
| #include <syscall.h> | ||||
|  | ||||
| #ifdef GRID_CUDA | ||||
| #include <cuda_runtime_api.h> | ||||
| @@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de> | ||||
| #ifdef GRID_HIP | ||||
| #include <hip/hip_runtime_api.h> | ||||
| #endif | ||||
| #ifdef GRID_SYCl | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | ||||
| #include <syscall.h> | ||||
| #define SHM_SOCKETS  | ||||
| #endif | ||||
|  | ||||
| #include <sys/socket.h> | ||||
| #include <sys/un.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
| #define header "SharedMemoryMpi: " | ||||
|  | ||||
| #ifdef SHM_SOCKETS | ||||
|  | ||||
| /* | ||||
|  * Barbaric extra intranode communication route in case we need sockets to pass FDs | ||||
|  * Forced by level_zero not being nicely designed | ||||
|  */ | ||||
| static int sock; | ||||
| static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; | ||||
| static char sock_path[256]; | ||||
| class UnixSockets { | ||||
| public: | ||||
|   static void Open(int rank) | ||||
|   { | ||||
|     int errnum; | ||||
|  | ||||
|     sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0); | ||||
|  | ||||
|     struct sockaddr_un sa_un = { 0 }; | ||||
|     sa_un.sun_family = AF_UNIX; | ||||
|     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); | ||||
|     unlink(sa_un.sun_path); | ||||
|     if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { | ||||
|       perror("bind failure"); | ||||
|       exit(EXIT_FAILURE); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   static int RecvFileDescriptor(void) | ||||
|   { | ||||
|     int n; | ||||
|     int fd; | ||||
|     char buf[1]; | ||||
|     struct iovec iov; | ||||
|     struct msghdr msg; | ||||
|     struct cmsghdr *cmsg; | ||||
|     char cms[CMSG_SPACE(sizeof(int))]; | ||||
|  | ||||
|     iov.iov_base = buf; | ||||
|     iov.iov_len = 1; | ||||
|  | ||||
|     memset(&msg, 0, sizeof msg); | ||||
|     msg.msg_name = 0; | ||||
|     msg.msg_namelen = 0; | ||||
|     msg.msg_iov = &iov; | ||||
|     msg.msg_iovlen = 1; | ||||
|  | ||||
|     msg.msg_control = (caddr_t)cms; | ||||
|     msg.msg_controllen = sizeof cms; | ||||
|  | ||||
|     if((n=recvmsg(sock, &msg, 0)) < 0) { | ||||
|       perror("recvmsg failed"); | ||||
|       return -1; | ||||
|     } | ||||
|     if(n == 0){ | ||||
|       perror("recvmsg returned 0"); | ||||
|       return -1; | ||||
|     } | ||||
|     cmsg = CMSG_FIRSTHDR(&msg); | ||||
|  | ||||
|     memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); | ||||
|  | ||||
|     return fd; | ||||
|   } | ||||
|  | ||||
|   static void SendFileDescriptor(int fildes,int xmit_to_rank) | ||||
|   { | ||||
|     struct msghdr msg; | ||||
|     struct iovec iov; | ||||
|     struct cmsghdr *cmsg = NULL; | ||||
|     char ctrl[CMSG_SPACE(sizeof(int))]; | ||||
|     char data = ' '; | ||||
|  | ||||
|     memset(&msg, 0, sizeof(struct msghdr)); | ||||
|     memset(ctrl, 0, CMSG_SPACE(sizeof(int))); | ||||
|     iov.iov_base = &data; | ||||
|     iov.iov_len = sizeof(data); | ||||
|      | ||||
|     sprintf(sock_path,sock_path_fmt,xmit_to_rank); | ||||
|      | ||||
|     struct sockaddr_un sa_un = { 0 }; | ||||
|     sa_un.sun_family = AF_UNIX; | ||||
|     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); | ||||
|  | ||||
|     msg.msg_name = (void *)&sa_un; | ||||
|     msg.msg_namelen = sizeof(sa_un); | ||||
|     msg.msg_iov = &iov; | ||||
|     msg.msg_iovlen = 1; | ||||
|     msg.msg_controllen =  CMSG_SPACE(sizeof(int)); | ||||
|     msg.msg_control = ctrl; | ||||
|  | ||||
|     cmsg = CMSG_FIRSTHDR(&msg); | ||||
|     cmsg->cmsg_level = SOL_SOCKET; | ||||
|     cmsg->cmsg_type = SCM_RIGHTS; | ||||
|     cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | ||||
|  | ||||
|     *((int *) CMSG_DATA(cmsg)) = fildes; | ||||
|  | ||||
|     sendmsg(sock, &msg, 0); | ||||
|   }; | ||||
| }; | ||||
| #endif | ||||
|  | ||||
|  | ||||
| /*Construct from an MPI communicator*/ | ||||
| void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
| { | ||||
| @@ -65,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||
|   MPI_Comm_size(WorldShmComm     ,&WorldShmSize); | ||||
|  | ||||
|   if ( WorldRank == 0) { | ||||
|     std::cout << header " World communicator of size " <<WorldSize << std::endl;   | ||||
|     std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl; | ||||
|     std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;   | ||||
|     std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl; | ||||
|   } | ||||
|   // WorldShmComm, WorldShmSize, WorldShmRank | ||||
|  | ||||
| @@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M | ||||
|   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); | ||||
|   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); | ||||
| } | ||||
| static inline int divides(int a,int b) | ||||
| { | ||||
|   return ( b == ( (b/a)*a ) ); | ||||
| } | ||||
| void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Allow user to configure through environment variable | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); | ||||
|   if ( str ) { | ||||
|     std::vector<int> IntShmDims; | ||||
|     GridCmdOptionIntVector(std::string(str),IntShmDims); | ||||
|     assert(IntShmDims.size() == WorldDims.size()); | ||||
|     long ShmSize = 1; | ||||
|     for (int dim=0;dim<WorldDims.size();dim++) { | ||||
|       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); | ||||
|       assert(divides(ShmDims[dim],WorldDims[dim])); | ||||
|     } | ||||
|     assert(ShmSize == WorldShmSize); | ||||
|     return; | ||||
|   } | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   // Powers of 2,3,5 only in prime decomposition for now | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   int ndimension = WorldDims.size(); | ||||
|   ShmDims=Coordinate(ndimension,1); | ||||
|  | ||||
|   std::vector<int> primes({2,3,5}); | ||||
|  | ||||
|   int dim = 0; | ||||
|   int last_dim = ndimension - 1; | ||||
|   int AutoShmSize = 1; | ||||
|   while(AutoShmSize != WorldShmSize) { | ||||
|     int p; | ||||
|     for(p=0;p<primes.size();p++) { | ||||
|       int prime=primes[p]; | ||||
|       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) | ||||
|         && divides(prime,WorldShmSize/AutoShmSize)  ) { | ||||
| 	AutoShmSize*=prime; | ||||
| 	ShmDims[dim]*=prime; | ||||
| 	last_dim = dim; | ||||
| 	break; | ||||
|       } | ||||
|     } | ||||
|     if (p == primes.size() && last_dim == dim) { | ||||
|       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; | ||||
|       exit(EXIT_FAILURE); | ||||
|     } | ||||
|     dim=(dim+1) %ndimension; | ||||
|   } | ||||
| } | ||||
| void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) | ||||
| { | ||||
|   //////////////////////////////////////////////////////////////// | ||||
| @@ -395,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce | ||||
| #ifdef GRID_MPI3_SHMGET | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||
|   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|  | ||||
| @@ -480,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|  | ||||
|   std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||
|   std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||
| 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||
|  | ||||
|   SharedMemoryZero(ShmCommBuf,bytes); | ||||
| @@ -523,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|   if ( WorldRank == 0 ){ | ||||
|     std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||
|     std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||
| 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; | ||||
|   } | ||||
|   SharedMemoryZero(ShmCommBuf,bytes); | ||||
| @@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Loop over ranks/gpu's on our node | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #ifdef SHM_SOCKETS | ||||
|   UnixSockets::Open(WorldShmRank); | ||||
| #endif | ||||
|   for(int r=0;r<WorldShmSize;r++){ | ||||
|  | ||||
|     MPI_Barrier(WorldShmComm); | ||||
|  | ||||
| #ifndef GRID_MPI3_SHM_NONE | ||||
|     ////////////////////////////////////////////////// | ||||
|     // If it is me, pass around the IPC access key | ||||
| @@ -540,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     void * thisBuf = ShmCommBuf; | ||||
|     if(!Stencil_force_mpi) { | ||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||
|     typedef struct { int fd; pid_t pid ; } clone_mem_t; | ||||
|     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; | ||||
|  | ||||
|     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device()); | ||||
|     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context()); | ||||
|     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); | ||||
|     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); | ||||
|        | ||||
|     ze_ipc_mem_handle_t ihandle; | ||||
|     clone_mem_t handle; | ||||
|  | ||||
|      | ||||
|     if ( r==WorldShmRank ) {  | ||||
|       auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); | ||||
|       if ( err != ZE_RESULT_SUCCESS ) { | ||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||
| 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } else { | ||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||
|       } | ||||
|       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); | ||||
|       handle.pid = getpid(); | ||||
|       memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle)); | ||||
| #ifdef SHM_SOCKETS | ||||
|       for(int rr=0;rr<WorldShmSize;rr++){ | ||||
| 	if(rr!=r){ | ||||
| 	  UnixSockets::SendFileDescriptor(handle.fd,rr); | ||||
| 	} | ||||
|       } | ||||
| #endif | ||||
|     } | ||||
| #endif | ||||
| #ifdef GRID_CUDA | ||||
| @@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     // Share this IPC handle across the Shm Comm | ||||
|     ////////////////////////////////////////////////// | ||||
|     {  | ||||
|       MPI_Barrier(WorldShmComm); | ||||
|       int ierr=MPI_Bcast(&handle, | ||||
| 			 sizeof(handle), | ||||
| 			 MPI_BYTE, | ||||
| @@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||
|     if ( r!=WorldShmRank ) { | ||||
|       thisBuf = nullptr; | ||||
|       int myfd; | ||||
| #ifdef SHM_SOCKETS | ||||
|       myfd=UnixSockets::RecvFileDescriptor(); | ||||
| #else | ||||
|       std::cout<<"mapping seeking remote pid/fd " | ||||
| 	       <<handle.pid<<"/" | ||||
| 	       <<handle.fd<<std::endl; | ||||
| @@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|       int pidfd = syscall(SYS_pidfd_open,handle.pid,0); | ||||
|       std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; | ||||
|       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); | ||||
|       int myfd  = syscall(438,pidfd,handle.fd,0); | ||||
|  | ||||
|       std::cout<<"Using IpcHandle myfd "<<myfd<<"\n"; | ||||
|        | ||||
|       myfd  = syscall(438,pidfd,handle.fd,0); | ||||
|       int err_t = errno; | ||||
|       if (myfd < 0) { | ||||
|         fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr); | ||||
| 	perror("pidfd_getfd failed "); | ||||
| 	assert(0); | ||||
|       } | ||||
| #endif | ||||
|       std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; | ||||
|       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); | ||||
|       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); | ||||
|  | ||||
|       auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf); | ||||
|       if ( err != ZE_RESULT_SUCCESS ) { | ||||
| 	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | ||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | ||||
| 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | ||||
| 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } else { | ||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl; | ||||
| @@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| #else | ||||
|     WorldShmCommBufs[r] = ShmCommBuf; | ||||
| #endif | ||||
|     MPI_Barrier(WorldShmComm); | ||||
|   } | ||||
|  | ||||
|   _ShmAllocBytes=bytes; | ||||
| @@ -662,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| #ifdef GRID_MPI3_SHMMMAP | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||
|   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -699,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     assert(((uint64_t)ptr&0x3F)==0); | ||||
|     close(fd); | ||||
|     WorldShmCommBufs[r] =ptr; | ||||
|     //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||
|     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||
|   } | ||||
|   _ShmAlloc=1; | ||||
|   _ShmAllocBytes  = bytes; | ||||
| @@ -709,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| #ifdef GRID_MPI3_SHM_NONE | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||
|   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0); | ||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -756,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| {  | ||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||
|   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||
|   assert(_ShmSetup==1); | ||||
|   assert(_ShmAlloc==0);  | ||||
|   MPI_Barrier(WorldShmComm); | ||||
|   | ||||
| @@ -68,7 +68,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Each MPI rank should allocate our own buffer | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||
|   ShmCommBuf = acceleratorAllocShared(bytes); | ||||
|   //ShmCommBuf = acceleratorAllocDevice(bytes); | ||||
|  | ||||
|   if (ShmCommBuf == (void *)NULL ) { | ||||
|     std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||
|   | ||||
| @@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| extern Vector<std::pair<int,int> > Cshift_table;  | ||||
| extern std::vector<std::pair<int,int> > Cshift_table;  | ||||
| extern commVector<std::pair<int,int> > Cshift_table_device;  | ||||
|  | ||||
| inline std::pair<int,int> *MapCshiftTable(void) | ||||
| { | ||||
|   // GPU version | ||||
| #ifdef ACCELERATOR_CSHIFT     | ||||
|   uint64_t sz=Cshift_table.size(); | ||||
|   if (Cshift_table_device.size()!=sz )    { | ||||
|     Cshift_table_device.resize(sz); | ||||
|   } | ||||
|   acceleratorCopyToDevice((void *)&Cshift_table[0], | ||||
| 			  (void *)&Cshift_table_device[0], | ||||
| 			  sizeof(Cshift_table[0])*sz); | ||||
|  | ||||
|   return &Cshift_table_device[0]; | ||||
| #else  | ||||
|   return &Cshift_table[0]; | ||||
| #endif | ||||
|   // CPU version use identify map | ||||
| } | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| // Gather for when there is no need to SIMD split  | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| @@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim | ||||
|   } | ||||
|   { | ||||
|     auto buffer_p = & buffer[0]; | ||||
|     auto table = &Cshift_table[0]; | ||||
| #ifdef ACCELERATOR_CSHIFT     | ||||
|     auto table = MapCshiftTable(); | ||||
| #ifdef ACCELERATOR_CSHIFT | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||
| 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | ||||
| @@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector< | ||||
|    | ||||
|   { | ||||
|     auto buffer_p = & buffer[0]; | ||||
|     auto table = &Cshift_table[0]; | ||||
|     auto table = MapCshiftTable(); | ||||
| #ifdef ACCELERATOR_CSHIFT     | ||||
|     autoView( rhs_v, rhs, AcceleratorWrite); | ||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||
| @@ -340,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | ||||
|   } | ||||
|  | ||||
|   { | ||||
|     auto table = &Cshift_table[0]; | ||||
|     auto table = MapCshiftTable(); | ||||
| #ifdef ACCELERATOR_CSHIFT     | ||||
|     autoView(rhs_v , rhs, AcceleratorRead); | ||||
|     autoView(lhs_v , lhs, AcceleratorWrite); | ||||
| @@ -392,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | ||||
|   } | ||||
|  | ||||
|   { | ||||
|     auto table = &Cshift_table[0]; | ||||
|     auto table = MapCshiftTable(); | ||||
| #ifdef ACCELERATOR_CSHIFT     | ||||
|     autoView( rhs_v, rhs, AcceleratorRead); | ||||
|     autoView( lhs_v, lhs, AcceleratorWrite); | ||||
|   | ||||
| @@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | ||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; | ||||
|   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); | ||||
|  | ||||
|  | ||||
|   RealD t1,t0; | ||||
|   t0=usecond(); | ||||
|   if ( !comm_dim ) { | ||||
|     //std::cout << "CSHIFT: Cshift_local" <<std::endl; | ||||
|     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding | ||||
| @@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | ||||
|     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; | ||||
|     Cshift_comms(ret,rhs,dimension,shift); | ||||
|   } | ||||
|   t1=usecond(); | ||||
|   //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| @@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|      | ||||
|   int cb= (cbmask==0x2)? Odd : Even; | ||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||
|  | ||||
|   RealD tcopy=0.0; | ||||
|   RealD tgather=0.0; | ||||
|   RealD tscatter=0.0; | ||||
|   RealD tcomms=0.0; | ||||
|   uint64_t xbytes=0; | ||||
|   for(int x=0;x<rd;x++){        | ||||
|  | ||||
|     int sx        =  (x+sshift)%rd; | ||||
|     int comm_proc = ((x+sshift)/rd)%pd; | ||||
|      | ||||
|     if (comm_proc==0) { | ||||
|  | ||||
|       tcopy-=usecond(); | ||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||
|  | ||||
|       tcopy+=usecond(); | ||||
|     } else { | ||||
|  | ||||
|       int words = buffer_size; | ||||
| @@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|  | ||||
|       int bytes = words * sizeof(vobj); | ||||
|  | ||||
|       tgather-=usecond(); | ||||
|       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); | ||||
|       tgather+=usecond(); | ||||
|  | ||||
|       //      int rank           = grid->_processor; | ||||
|       int recv_from_rank; | ||||
|       int xmit_to_rank; | ||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||
|  | ||||
|       grid->Barrier(); | ||||
|        | ||||
|       tcomms-=usecond(); | ||||
|       //      grid->Barrier(); | ||||
|  | ||||
|       grid->SendToRecvFrom((void *)&send_buf[0], | ||||
| 			   xmit_to_rank, | ||||
| 			   (void *)&recv_buf[0], | ||||
| 			   recv_from_rank, | ||||
| 			   bytes); | ||||
|       xbytes+=bytes; | ||||
|       //      grid->Barrier(); | ||||
|       tcomms+=usecond(); | ||||
|  | ||||
|       grid->Barrier(); | ||||
|  | ||||
|       tscatter-=usecond(); | ||||
|       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); | ||||
|       tscatter+=usecond(); | ||||
|     } | ||||
|   } | ||||
|   /* | ||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||
|   */ | ||||
| } | ||||
|  | ||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||
| @@ -190,6 +210,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|   assert(shift>=0); | ||||
|   assert(shift<fd); | ||||
|  | ||||
|   RealD tcopy=0.0; | ||||
|   RealD tgather=0.0; | ||||
|   RealD tscatter=0.0; | ||||
|   RealD tcomms=0.0; | ||||
|   uint64_t xbytes=0; | ||||
|    | ||||
|   int permute_type=grid->PermuteType(dimension); | ||||
|  | ||||
|   /////////////////////////////////////////////// | ||||
| @@ -227,7 +253,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|       pointers[i] = &send_buf_extract[i][0]; | ||||
|     } | ||||
|     int sx   = (x+sshift)%rd; | ||||
|     tgather-=usecond(); | ||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||
|     tgather+=usecond(); | ||||
|  | ||||
|     for(int i=0;i<Nsimd;i++){ | ||||
|        | ||||
| @@ -252,7 +280,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|       if(nbr_proc){ | ||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||
|  | ||||
| 	grid->Barrier(); | ||||
| 	tcomms-=usecond(); | ||||
| 	//	grid->Barrier(); | ||||
|  | ||||
| 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | ||||
| 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | ||||
| @@ -262,7 +291,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
| 			     recv_from_rank, | ||||
| 			     bytes); | ||||
|  | ||||
| 	grid->Barrier(); | ||||
| 	xbytes+=bytes; | ||||
| 	//	grid->Barrier(); | ||||
| 	tcomms+=usecond(); | ||||
|  | ||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | ||||
|       } else {  | ||||
| @@ -270,9 +301,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|       } | ||||
|  | ||||
|     } | ||||
|     tscatter-=usecond(); | ||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||
|     tscatter+=usecond(); | ||||
|   } | ||||
|  | ||||
|   /* | ||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||
|   */ | ||||
| } | ||||
| #else  | ||||
| template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||
| @@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|   assert(comm_dim==1); | ||||
|   assert(shift>=0); | ||||
|   assert(shift<fd); | ||||
|   RealD tcopy=0.0; | ||||
|   RealD tgather=0.0; | ||||
|   RealD tscatter=0.0; | ||||
|   RealD tcomms=0.0; | ||||
|   uint64_t xbytes=0; | ||||
|    | ||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; | ||||
|   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); | ||||
| @@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|      | ||||
|     if (comm_proc==0) { | ||||
|  | ||||
|       tcopy-=usecond(); | ||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||
|       tcopy+=usecond(); | ||||
|  | ||||
|     } else { | ||||
|  | ||||
| @@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|  | ||||
|       int bytes = words * sizeof(vobj); | ||||
|  | ||||
|       tgather-=usecond(); | ||||
|       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); | ||||
|       tgather+=usecond(); | ||||
|  | ||||
|       //      int rank           = grid->_processor; | ||||
|       int recv_from_rank; | ||||
| @@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||
|  | ||||
|  | ||||
|       grid->Barrier(); | ||||
|       tcomms-=usecond(); | ||||
|       //      grid->Barrier(); | ||||
|  | ||||
|       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); | ||||
|       grid->SendToRecvFrom((void *)&send_buf[0], | ||||
| @@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | ||||
| 			   (void *)&recv_buf[0], | ||||
| 			   recv_from_rank, | ||||
| 			   bytes); | ||||
|       xbytes+=bytes; | ||||
|       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); | ||||
|  | ||||
|       grid->Barrier(); | ||||
|       //      grid->Barrier(); | ||||
|       tcomms+=usecond(); | ||||
|  | ||||
|       tscatter-=usecond(); | ||||
|       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); | ||||
|       tscatter+=usecond(); | ||||
|     } | ||||
|   } | ||||
|   /* | ||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||
|   */ | ||||
| } | ||||
|  | ||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||
| @@ -372,6 +432,11 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|   assert(simd_layout==2); | ||||
|   assert(shift>=0); | ||||
|   assert(shift<fd); | ||||
|   RealD tcopy=0.0; | ||||
|   RealD tgather=0.0; | ||||
|   RealD tscatter=0.0; | ||||
|   RealD tcomms=0.0; | ||||
|   uint64_t xbytes=0; | ||||
|  | ||||
|   int permute_type=grid->PermuteType(dimension); | ||||
|  | ||||
| @@ -414,8 +479,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|     for(int i=0;i<Nsimd;i++){        | ||||
|       pointers[i] = &send_buf_extract[i][0]; | ||||
|     } | ||||
|     tgather-=usecond(); | ||||
|     int sx   = (x+sshift)%rd; | ||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||
|     tgather+=usecond(); | ||||
|  | ||||
|     for(int i=0;i<Nsimd;i++){ | ||||
|        | ||||
| @@ -440,7 +507,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
|       if(nbr_proc){ | ||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||
|  | ||||
| 	grid->Barrier(); | ||||
| 	tcomms-=usecond(); | ||||
| 	//	grid->Barrier(); | ||||
|  | ||||
| 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); | ||||
| 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | ||||
| @@ -449,17 +517,28 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | ||||
| 			     recv_from_rank, | ||||
| 			     bytes); | ||||
| 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | ||||
| 	xbytes+=bytes; | ||||
|  | ||||
| 	grid->Barrier(); | ||||
| 	//	grid->Barrier(); | ||||
| 	tcomms+=usecond(); | ||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | ||||
|       } else {  | ||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||
|       } | ||||
|  | ||||
|     } | ||||
|     tscatter-=usecond(); | ||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||
|   } | ||||
|     tscatter+=usecond(); | ||||
|  | ||||
|   } | ||||
|   /* | ||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; | ||||
|   */ | ||||
| } | ||||
| #endif | ||||
| NAMESPACE_END(Grid);  | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| #include <Grid/GridCore.h>        | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| Vector<std::pair<int,int> > Cshift_table;  | ||||
| std::vector<std::pair<int,int> > Cshift_table;  | ||||
| commVector<std::pair<int,int> > Cshift_table_device;  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/lattice/Lattice_transfer.h> | ||||
| #include <Grid/lattice/Lattice_basis.h> | ||||
| #include <Grid/lattice/Lattice_crc.h> | ||||
| #include <Grid/lattice/PaddedCell.h> | ||||
|   | ||||
| @@ -345,7 +345,9 @@ GridUnopClass(UnaryNot, Not(a)); | ||||
| GridUnopClass(UnaryTrace, trace(a)); | ||||
| GridUnopClass(UnaryTranspose, transpose(a)); | ||||
| GridUnopClass(UnaryTa, Ta(a)); | ||||
| GridUnopClass(UnarySpTa, SpTa(a)); | ||||
| GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); | ||||
| GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a)); | ||||
| GridUnopClass(UnaryTimesI, timesI(a)); | ||||
| GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); | ||||
| GridUnopClass(UnaryAbs, abs(a)); | ||||
| @@ -456,7 +458,9 @@ GRID_DEF_UNOP(operator!, UnaryNot); | ||||
| GRID_DEF_UNOP(trace, UnaryTrace); | ||||
| GRID_DEF_UNOP(transpose, UnaryTranspose); | ||||
| GRID_DEF_UNOP(Ta, UnaryTa); | ||||
| GRID_DEF_UNOP(SpTa, UnarySpTa); | ||||
| GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); | ||||
| GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup); | ||||
| GRID_DEF_UNOP(timesI, UnaryTimesI); | ||||
| GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); | ||||
| GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the | ||||
|   | ||||
| @@ -270,5 +270,42 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L | ||||
|     return axpby_norm_fast(ret,a,b,x,y); | ||||
| } | ||||
|  | ||||
| /// Trace product | ||||
| template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2) | ||||
|   -> Lattice<decltype(trace(obj()))> | ||||
| { | ||||
|   typedef decltype(trace(obj())) robj; | ||||
|   Lattice<robj> ret_i(rhs_1.Grid()); | ||||
|   autoView( rhs1 , rhs_1, AcceleratorRead); | ||||
|   autoView( rhs2 , rhs_2, AcceleratorRead); | ||||
|   autoView( ret , ret_i, AcceleratorWrite); | ||||
|   ret.Checkerboard() = rhs_1.Checkerboard(); | ||||
|   accelerator_for(ss,rhs1.size(),obj::Nsimd(),{ | ||||
|       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss))); | ||||
|   }); | ||||
|   return ret_i; | ||||
| } | ||||
|  | ||||
| template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2) | ||||
|   -> Lattice<decltype(trace(obj1()))> | ||||
| { | ||||
|   typedef decltype(trace(obj1())) robj; | ||||
|   Lattice<robj> ret_i(rhs_1.Grid()); | ||||
|   autoView( rhs1 , rhs_1, AcceleratorRead); | ||||
|   autoView( ret , ret_i, AcceleratorWrite); | ||||
|   ret.Checkerboard() = rhs_1.Checkerboard(); | ||||
|   accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{ | ||||
|       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); | ||||
|   }); | ||||
|   return ret_i; | ||||
| } | ||||
| template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1) | ||||
|   -> Lattice<decltype(trace(obj1()))> | ||||
| { | ||||
|   return traceProduct(rhs_1,rhs_2); | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|   | ||||
| @@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   typename vobj::scalar_object ssum; | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   ssum= sum_gpu(&arg_v[0],osites); | ||||
|   return sum_gpu(&arg_v[0],osites); | ||||
| #else | ||||
|   autoView(arg_v, arg, CpuRead); | ||||
|   auto ssum= sum_cpu(&arg_v[0],osites); | ||||
|   return sum_cpu(&arg_v[0],osites); | ||||
| #endif   | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   auto ssum = rankSum(arg); | ||||
|   arg.Grid()->GlobalSum(ssum); | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||
| inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_gpu_large(&arg_v[0],osites); | ||||
|   return sum_gpu_large(&arg_v[0],osites); | ||||
| #else | ||||
|   autoView(arg_v, arg, CpuRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_cpu(&arg_v[0],osites); | ||||
|   return sum_cpu(&arg_v[0],osites); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||
| { | ||||
|   auto ssum = rankSumLarge(arg); | ||||
|   arg.Grid()->GlobalSum(ssum); | ||||
|   return ssum; | ||||
| } | ||||
| @@ -240,10 +251,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|     // This code could read coalesce | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, nsimd,{ | ||||
| 	auto x_l = left_v(ss); | ||||
| 	auto y_l = right_v(ss); | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); | ||||
|     accelerator_for( ss, sites, 1,{ | ||||
|         auto x_l = left_v[ss]; | ||||
|         auto y_l = right_v[ss]; | ||||
|         inner_tmp_v[ss]=innerProductD(x_l,y_l); | ||||
|     }); | ||||
|   } | ||||
| #else | ||||
| @@ -256,11 +267,18 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|  | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, nsimd,{ | ||||
| 	auto x_l = left_v(ss); | ||||
| 	auto y_l = right_v(ss); | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||
|     }); | ||||
|     //accelerator_for( ss, sites, nsimd,{ | ||||
|     //    auto x_l = left_v(ss); | ||||
|     //    auto y_l = right_v(ss); | ||||
|     //    coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||
|     //}); | ||||
|     #pragma omp target map ( to:left_v, right_v ) map ( tofrom:inner_tmp_v ) | ||||
|     #pragma omp teams distribute parallel for thread_limit(THREAD_LIMIT) //nowait | ||||
|     for ( uint64_t ss=0;ss<sites;ss++) {  | ||||
|         auto x_l = left_v[ss]; | ||||
|         auto y_l = right_v[ss]; | ||||
|         coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|   // This is in single precision and fails some tests | ||||
|   | ||||
| @@ -30,9 +30,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & | ||||
|   cudaGetDevice(&device); | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|   hipGetDevice(&device); | ||||
|   auto r=hipGetDevice(&device); | ||||
| #endif | ||||
|    | ||||
| #ifdef GRID_OMPTARGET | ||||
|   device = omp_get_device_num();   | ||||
| #endif | ||||
|  | ||||
|   Iterator warpSize            = gpu_props[device].warpSize; | ||||
|   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock; | ||||
|   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock; | ||||
| @@ -211,25 +214,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | ||||
|   assert(ok); | ||||
|  | ||||
|   Integer smemSize = numThreads * sizeof(sobj); | ||||
|   // UVM seems to be buggy under later CUDA drivers | ||||
|   // This fails on A100 and driver 5.30.02 / CUDA 12.1 | ||||
|   // Fails with multiple NVCC versions back to 11.4, | ||||
|   // which worked with earlier drivers. | ||||
|   // Not sure which driver had first fail and this bears checking | ||||
|   // Is awkward as must install multiple driver versions | ||||
|   // Move out of UVM | ||||
|   // Turns out I had messed up the synchronise after move to compute stream | ||||
|   // as running this on the default stream fools the synchronise | ||||
| #undef UVM_BLOCK_BUFFER   | ||||
| #ifndef UVM_BLOCK_BUFFER   | ||||
|   commVector<sobj> buffer(numBlocks); | ||||
|   sobj *buffer_v = &buffer[0]; | ||||
|   sobj result; | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||
|   accelerator_barrier(); | ||||
|   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); | ||||
| #else | ||||
|   Vector<sobj> buffer(numBlocks); | ||||
|   sobj *buffer_v = &buffer[0]; | ||||
|   sobj result; | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||
|   accelerator_barrier(); | ||||
|   result = *buffer_v; | ||||
| #endif | ||||
|   | ||||
| @@ -152,6 +152,7 @@ public: | ||||
| #ifdef RNG_FAST_DISCARD | ||||
|   static void Skip(RngEngine &eng,uint64_t site) | ||||
|   { | ||||
| #if 0 | ||||
|     ///////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Skip by 2^40 elements between successive lattice sites | ||||
|     // This goes by 10^12. | ||||
| @@ -162,9 +163,9 @@ public: | ||||
|     // tens of seconds per trajectory so this is clean in all reasonable cases, | ||||
|     // and margin of safety is orders of magnitude. | ||||
|     // We could hack Sitmo to skip in the higher order words of state if necessary | ||||
|       // | ||||
|       // Replace with 2^30 ; avoid problem on large volumes | ||||
|       // | ||||
|     // | ||||
|     // Replace with 2^30 ; avoid problem on large volumes | ||||
|     // | ||||
|     ///////////////////////////////////////////////////////////////////////////////////// | ||||
|     //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init | ||||
|     const int shift = 30; | ||||
| @@ -179,6 +180,9 @@ public: | ||||
|     assert((skip >> shift)==site); // check for overflow | ||||
|  | ||||
|     eng.discard(skip); | ||||
| #else | ||||
|     eng.discardhi(site); | ||||
| #endif | ||||
|     //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; | ||||
|   }  | ||||
| #endif | ||||
| @@ -440,17 +444,8 @@ public: | ||||
| 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx); | ||||
|  | ||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
| #if 1 | ||||
| 	assert(rank == _grid->ThisRank() ); | ||||
| #else | ||||
| //  | ||||
| 	if (rank != _grid->ThisRank() ){ | ||||
| 	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl; | ||||
| //	exit(-42); | ||||
| //	assert(0); | ||||
| 	} | ||||
| #endif | ||||
|  | ||||
| 	assert(rank == _grid->ThisRank() ); | ||||
| 	 | ||||
| 	int l_idx=generator_idx(o_idx,i_idx); | ||||
| 	_generators[l_idx] = master_engine; | ||||
|   | ||||
| @@ -66,6 +66,65 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex< | ||||
|   return ret; | ||||
| }; | ||||
|  | ||||
| template<int N, class Vec> | ||||
| Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu) | ||||
| { | ||||
|   GridBase *grid=Umu.Grid(); | ||||
|   auto lvol = grid->lSites(); | ||||
|   Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid); | ||||
|   typedef typename Vec::scalar_type scalar; | ||||
|   autoView(Umu_v,Umu,CpuRead); | ||||
|   autoView(ret_v,ret,CpuWrite); | ||||
|   thread_for(site,lvol,{ | ||||
|     Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); | ||||
|     Coordinate lcoor; | ||||
|     grid->LocalIndexToLocalCoor(site, lcoor); | ||||
|     iScalar<iScalar<iMatrix<scalar, N> > > Us; | ||||
|     peekLocalSite(Us, Umu_v, lcoor); | ||||
|     for(int i=0;i<N;i++){ | ||||
|       for(int j=0;j<N;j++){ | ||||
| 	scalar tmp= Us()()(i,j); | ||||
| 	ComplexD ztmp(real(tmp),imag(tmp)); | ||||
| 	EigenU(i,j)=ztmp; | ||||
|       }} | ||||
|     ComplexD detD  = EigenU.determinant(); | ||||
|     typename Vec::scalar_type det(detD.real(),detD.imag()); | ||||
|     pokeLocalSite(det,ret_v,lcoor); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template<int N> | ||||
| Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu) | ||||
| { | ||||
|   GridBase *grid=Umu.Grid(); | ||||
|   auto lvol = grid->lSites(); | ||||
|   Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid); | ||||
|    | ||||
|   autoView(Umu_v,Umu,CpuRead); | ||||
|   autoView(ret_v,ret,CpuWrite); | ||||
|   thread_for(site,lvol,{ | ||||
|     Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); | ||||
|     Coordinate lcoor; | ||||
|     grid->LocalIndexToLocalCoor(site, lcoor); | ||||
|     iScalar<iScalar<iMatrix<ComplexD, N> > > Us; | ||||
|     iScalar<iScalar<iMatrix<ComplexD, N> > > Ui; | ||||
|     peekLocalSite(Us, Umu_v, lcoor); | ||||
|     for(int i=0;i<N;i++){ | ||||
|       for(int j=0;j<N;j++){ | ||||
| 	EigenU(i,j) = Us()()(i,j); | ||||
|       }} | ||||
|     Eigen::MatrixXcd EigenUinv = EigenU.inverse(); | ||||
|     for(int i=0;i<N;i++){ | ||||
|       for(int j=0;j<N;j++){ | ||||
| 	Ui()()(i,j) = EigenUinv(i,j); | ||||
|       }} | ||||
|     pokeLocalSite(Ui,ret_v,lcoor); | ||||
|   }); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
| #endif | ||||
|  | ||||
|   | ||||
| @@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData, | ||||
|     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);  | ||||
|   } | ||||
| } | ||||
| template<class vobj,class CComplex,int nbasis,class VLattice> | ||||
| inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||
|                                const std::vector<Lattice<vobj>> &fineData, | ||||
|                                const VLattice &Basis) | ||||
| { | ||||
|   int NBatch = fineData.size(); | ||||
|   assert(coarseData.size() == NBatch); | ||||
|  | ||||
|   GridBase * fine  = fineData[0].Grid(); | ||||
|   GridBase * coarse= coarseData[0].Grid(); | ||||
|  | ||||
|   Lattice<iScalar<CComplex>> ip(coarse); | ||||
|   std::vector<Lattice<vobj>> fineDataCopy = fineData; | ||||
|  | ||||
|   autoView(ip_, ip, AcceleratorWrite); | ||||
|   for(int v=0;v<nbasis;v++) { | ||||
|     for (int k=0; k<NBatch; k++) { | ||||
|       autoView( coarseData_ , coarseData[k], AcceleratorWrite); | ||||
|       blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine> | ||||
|       accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { | ||||
|         convertType(coarseData_[sc](v),ip_[sc]); | ||||
|       }); | ||||
|  | ||||
|       // improve numerical stability of projection | ||||
|       // |fine> = |fine> - <basis|fine> |basis> | ||||
|       ip=-ip; | ||||
|       blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);  | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<class vobj,class vobj2,class CComplex> | ||||
|   inline void blockZAXPY(Lattice<vobj> &fineZ, | ||||
| @@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj,class CComplex,int nbasis,class VLattice> | ||||
| inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||
|                                std::vector<Lattice<vobj>> &fineData, | ||||
|                                const VLattice &Basis) | ||||
| { | ||||
|   int NBatch = coarseData.size(); | ||||
|   assert(fineData.size() == NBatch); | ||||
|  | ||||
|   GridBase * fine   = fineData[0].Grid(); | ||||
|   GridBase * coarse = coarseData[0].Grid(); | ||||
|   for (int k=0; k<NBatch; k++) | ||||
|     fineData[k]=Zero(); | ||||
|   for (int i=0;i<nbasis;i++) { | ||||
|     for (int k=0; k<NBatch; k++) { | ||||
|       Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i); | ||||
|       blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. | ||||
| // Simd layouts need not match since we use peek/poke Local | ||||
| template<class vobj,class vvobj> | ||||
| @@ -648,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | ||||
|   for(int d=0;d<nd;d++){ | ||||
|     assert(Fg->_processors[d]  == Tg->_processors[d]); | ||||
|   } | ||||
|  | ||||
|   // the above should guarantee that the operations are local | ||||
|    | ||||
| #if 1 | ||||
|  | ||||
|   size_t nsite = 1; | ||||
|   for(int i=0;i<nd;i++) nsite *= RegionSize[i]; | ||||
|    | ||||
|   size_t tbytes = 4*nsite*sizeof(int); | ||||
|   int *table = (int*)malloc(tbytes); | ||||
|   | ||||
|   thread_for(idx, nsite, { | ||||
|       Coordinate from_coor, to_coor; | ||||
|       size_t rem = idx; | ||||
|       for(int i=0;i<nd;i++){ | ||||
| 	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i]; | ||||
| 	from_coor[i] = base_i + FromLowerLeft[i]; | ||||
| 	to_coor[i] = base_i + ToLowerLeft[i]; | ||||
|       } | ||||
|        | ||||
|       int foidx = Fg->oIndex(from_coor); | ||||
|       int fiidx = Fg->iIndex(from_coor); | ||||
|       int toidx = Tg->oIndex(to_coor); | ||||
|       int tiidx = Tg->iIndex(to_coor); | ||||
|       int* tt = table + 4*idx; | ||||
|       tt[0] = foidx; | ||||
|       tt[1] = fiidx; | ||||
|       tt[2] = toidx; | ||||
|       tt[3] = tiidx; | ||||
|     }); | ||||
|    | ||||
|   int* table_d = (int*)acceleratorAllocDevice(tbytes); | ||||
|   acceleratorCopyToDevice(table,table_d,tbytes); | ||||
|  | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|  | ||||
|   autoView(from_v,From,AcceleratorRead); | ||||
|   autoView(to_v,To,AcceleratorWrite); | ||||
|    | ||||
|   accelerator_for(idx,nsite,1,{ | ||||
|       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|       int* tt = table_d + 4*idx; | ||||
|       int from_oidx = *tt++; | ||||
|       int from_lane = *tt++; | ||||
|       int to_oidx = *tt++; | ||||
|       int to_lane = *tt; | ||||
|  | ||||
|       const vector_type* from = (const vector_type *)&from_v[from_oidx]; | ||||
|       vector_type* to = (vector_type *)&to_v[to_oidx]; | ||||
|        | ||||
|       scalar_type stmp; | ||||
|       for(int w=0;w<words;w++){ | ||||
| 	stmp = getlane(from[w], from_lane); | ||||
| 	putlane(to[w], stmp, to_lane); | ||||
|       } | ||||
|     }); | ||||
|    | ||||
|   acceleratorFreeDevice(table_d);     | ||||
|   free(table); | ||||
|    | ||||
|  | ||||
| #else   | ||||
|   Coordinate ldf = Fg->_ldimensions; | ||||
|   Coordinate rdf = Fg->_rdimensions; | ||||
|   Coordinate isf = Fg->_istride; | ||||
| @@ -658,9 +767,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | ||||
|   Coordinate ist = Tg->_istride; | ||||
|   Coordinate ost = Tg->_ostride; | ||||
|  | ||||
|   autoView( t_v , To, AcceleratorWrite); | ||||
|   autoView( f_v , From, AcceleratorRead); | ||||
|   accelerator_for(idx,Fg->lSites(),1,{ | ||||
|   autoView( t_v , To, CpuWrite); | ||||
|   autoView( f_v , From, CpuRead); | ||||
|   thread_for(idx,Fg->lSites(),{ | ||||
|     sobj s; | ||||
|     Coordinate Fcoor(nd); | ||||
|     Coordinate Tcoor(nd); | ||||
| @@ -673,17 +782,24 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | ||||
|       Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; | ||||
|     } | ||||
|     if (in_region) { | ||||
|       Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); | ||||
|       Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); | ||||
|       Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); | ||||
|       Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); | ||||
|       vector_type * fp = (vector_type *)&f_v[odx_f]; | ||||
|       vector_type * tp = (vector_type *)&t_v[odx_t]; | ||||
| #if 0       | ||||
|       Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from | ||||
|       Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to | ||||
|       Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from | ||||
|       Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to | ||||
|       scalar_type * fp = (scalar_type *)&f_v[odx_f]; | ||||
|       scalar_type * tp = (scalar_type *)&t_v[odx_t]; | ||||
|       for(int w=0;w<words;w++){ | ||||
| 	tp[w].putlane(fp[w].getlane(idx_f),idx_t); | ||||
|       } | ||||
| #else | ||||
|     peekLocalSite(s,f_v,Fcoor); | ||||
|     pokeLocalSite(s,t_v,Tcoor); | ||||
| #endif | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -776,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic | ||||
| } | ||||
|  | ||||
|  | ||||
| //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim | ||||
| //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same | ||||
| template<class vobj> | ||||
| void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||
| { | ||||
| @@ -792,11 +910,70 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int | ||||
|  | ||||
|   for(int d=0;d<nh;d++){ | ||||
|     if ( d!=orthog ) { | ||||
|     assert(lg->_processors[d]  == hg->_processors[d]); | ||||
|     assert(lg->_ldimensions[d] == hg->_ldimensions[d]); | ||||
|   } | ||||
|       assert(lg->_processors[d]  == hg->_processors[d]); | ||||
|       assert(lg->_ldimensions[d] == hg->_ldimensions[d]); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| #if 1 | ||||
|   size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog]; | ||||
|   size_t tbytes = 4*nsite*sizeof(int); | ||||
|   int *table = (int*)malloc(tbytes); | ||||
|    | ||||
|   thread_for(idx,nsite,{ | ||||
|     Coordinate lcoor(nl); | ||||
|     Coordinate hcoor(nh); | ||||
|     lcoor[orthog] = slice_lo; | ||||
|     hcoor[orthog] = slice_hi; | ||||
|     size_t rem = idx; | ||||
|     for(int mu=0;mu<nl;mu++){ | ||||
|       if(mu != orthog){ | ||||
| 	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu]; | ||||
| 	lcoor[mu] = hcoor[mu] = xmu; | ||||
|       } | ||||
|     } | ||||
|     int loidx = lg->oIndex(lcoor); | ||||
|     int liidx = lg->iIndex(lcoor); | ||||
|     int hoidx = hg->oIndex(hcoor); | ||||
|     int hiidx = hg->iIndex(hcoor); | ||||
|     int* tt = table + 4*idx; | ||||
|     tt[0] = loidx; | ||||
|     tt[1] = liidx; | ||||
|     tt[2] = hoidx; | ||||
|     tt[3] = hiidx; | ||||
|     }); | ||||
|     | ||||
|   int* table_d = (int*)acceleratorAllocDevice(tbytes); | ||||
|   acceleratorCopyToDevice(table,table_d,tbytes); | ||||
|  | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|  | ||||
|   autoView(lowDim_v,lowDim,AcceleratorRead); | ||||
|   autoView(higherDim_v,higherDim,AcceleratorWrite); | ||||
|    | ||||
|   accelerator_for(idx,nsite,1,{ | ||||
|       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||
|       int* tt = table_d + 4*idx; | ||||
|       int from_oidx = *tt++; | ||||
|       int from_lane = *tt++; | ||||
|       int to_oidx = *tt++; | ||||
|       int to_lane = *tt; | ||||
|  | ||||
|       const vector_type* from = (const vector_type *)&lowDim_v[from_oidx]; | ||||
|       vector_type* to = (vector_type *)&higherDim_v[to_oidx]; | ||||
|        | ||||
|       scalar_type stmp; | ||||
|       for(int w=0;w<words;w++){ | ||||
| 	stmp = getlane(from[w], from_lane); | ||||
| 	putlane(to[w], stmp, to_lane); | ||||
|       } | ||||
|     }); | ||||
|    | ||||
|   acceleratorFreeDevice(table_d);     | ||||
|   free(table); | ||||
|    | ||||
| #else | ||||
|   // the above should guarantee that the operations are local | ||||
|   autoView(lowDimv,lowDim,CpuRead); | ||||
|   autoView(higherDimv,higherDim,CpuWrite); | ||||
| @@ -812,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int | ||||
|       pokeLocalSite(s,higherDimv,hcoor); | ||||
|     } | ||||
|   }); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -79,7 +79,7 @@ public: | ||||
|   accelerator_inline uint64_t end(void)   const { return this->_odata_size; }; | ||||
|   accelerator_inline uint64_t size(void)  const { return this->_odata_size; }; | ||||
|  | ||||
|   LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){} | ||||
|   LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ } | ||||
|   LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable | ||||
|   LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me) | ||||
|   { | ||||
|   | ||||
							
								
								
									
										174
									
								
								Grid/lattice/PaddedCell.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								Grid/lattice/PaddedCell.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,174 @@ | ||||
| /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/lattice/PaddedCell.h | ||||
|  | ||||
|     Copyright (C) 2019 | ||||
|  | ||||
| Author: Peter Boyle pboyle@bnl.gov | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #pragma once | ||||
|  | ||||
| #include<Grid/cshift/Cshift.h> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| //Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions | ||||
| template<typename vobj> | ||||
| struct CshiftImplBase{ | ||||
|   virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0; | ||||
|   virtual ~CshiftImplBase(){} | ||||
| }; | ||||
| template<typename vobj> | ||||
| struct CshiftImplDefault: public CshiftImplBase<vobj>{ | ||||
|   Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); } | ||||
| }; | ||||
| template<typename Gimpl> | ||||
| struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{ | ||||
|   typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } | ||||
| };   | ||||
|  | ||||
| class PaddedCell { | ||||
| public: | ||||
|   GridCartesian * unpadded_grid; | ||||
|   int dims; | ||||
|   int depth; | ||||
|   std::vector<GridCartesian *> grids; | ||||
|  | ||||
|   ~PaddedCell() | ||||
|   { | ||||
|     DeleteGrids(); | ||||
|   } | ||||
|   PaddedCell(int _depth,GridCartesian *_grid) | ||||
|   { | ||||
|     unpadded_grid = _grid; | ||||
|     depth=_depth; | ||||
|     dims=_grid->Nd(); | ||||
|     AllocateGrids(); | ||||
|     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||
|     for(int d=0;d<dims;d++){ | ||||
|       assert(local[d]>=depth); | ||||
|     } | ||||
|   } | ||||
|   void DeleteGrids(void) | ||||
|   { | ||||
|     for(int d=0;d<grids.size();d++){ | ||||
|       delete grids[d]; | ||||
|     } | ||||
|     grids.resize(0); | ||||
|   }; | ||||
|   void AllocateGrids(void) | ||||
|   { | ||||
|     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||
|     Coordinate simd      =unpadded_grid->_simd_layout; | ||||
|     Coordinate processors=unpadded_grid->_processors; | ||||
|     Coordinate plocal    =unpadded_grid->LocalDimensions(); | ||||
|     Coordinate global(dims); | ||||
|  | ||||
|     // expand up one dim at a time | ||||
|     for(int d=0;d<dims;d++){ | ||||
|  | ||||
|       plocal[d] += 2*depth;  | ||||
|  | ||||
|       for(int d=0;d<dims;d++){ | ||||
| 	global[d] = plocal[d]*processors[d]; | ||||
|       } | ||||
|  | ||||
|       grids.push_back(new GridCartesian(global,simd,processors)); | ||||
|     } | ||||
|   }; | ||||
|   template<class vobj> | ||||
|   inline Lattice<vobj> Extract(const Lattice<vobj> &in) const | ||||
|   { | ||||
|     Lattice<vobj> out(unpadded_grid); | ||||
|  | ||||
|     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||
|     Coordinate fll(dims,depth); // depends on the MPI spread | ||||
|     Coordinate tll(dims,0); // depends on the MPI spread | ||||
|     localCopyRegion(in,out,fll,tll,local); | ||||
|     return out; | ||||
|   } | ||||
|   template<class vobj> | ||||
|   inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const | ||||
|   { | ||||
|     GridBase *old_grid = in.Grid(); | ||||
|     int dims = old_grid->Nd(); | ||||
|     Lattice<vobj> tmp = in; | ||||
|     for(int d=0;d<dims;d++){ | ||||
|       tmp = Expand(d,tmp,cshift); // rvalue && assignment | ||||
|     } | ||||
|     return tmp; | ||||
|   } | ||||
|   // expand up one dim at a time | ||||
|   template<class vobj> | ||||
|   inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const | ||||
|   { | ||||
|     GridBase *old_grid = in.Grid(); | ||||
|     GridCartesian *new_grid = grids[dim];//These are new grids | ||||
|     Lattice<vobj>  padded(new_grid); | ||||
|     Lattice<vobj> shifted(old_grid);     | ||||
|     Coordinate local     =old_grid->LocalDimensions(); | ||||
|     Coordinate plocal    =new_grid->LocalDimensions(); | ||||
|     if(dim==0) conformable(old_grid,unpadded_grid); | ||||
|     else       conformable(old_grid,grids[dim-1]); | ||||
|  | ||||
|     std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl; | ||||
|  | ||||
|     double tins=0, tshift=0; | ||||
|      | ||||
|     // Middle bit | ||||
|     double t = usecond(); | ||||
|     for(int x=0;x<local[dim];x++){ | ||||
|       InsertSliceLocal(in,padded,x,depth+x,dim); | ||||
|     } | ||||
|     tins += usecond() - t; | ||||
|      | ||||
|     // High bit | ||||
|     t = usecond(); | ||||
|     shifted = cshift.Cshift(in,dim,depth); | ||||
|     tshift += usecond() - t; | ||||
|  | ||||
|     t=usecond(); | ||||
|     for(int x=0;x<depth;x++){ | ||||
|       InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim); | ||||
|     } | ||||
|     tins += usecond() - t; | ||||
|      | ||||
|     // Low bit | ||||
|     t = usecond(); | ||||
|     shifted = cshift.Cshift(in,dim,-depth); | ||||
|     tshift += usecond() - t; | ||||
|      | ||||
|     t = usecond(); | ||||
|     for(int x=0;x<depth;x++){ | ||||
|       InsertSliceLocal(shifted,padded,x,x,dim); | ||||
|     } | ||||
|     tins += usecond() - t; | ||||
|  | ||||
|     std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl; | ||||
|      | ||||
|     return padded; | ||||
|   } | ||||
|  | ||||
| }; | ||||
|   | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| @@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca | ||||
| template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ; | ||||
| template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; | ||||
| template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; | ||||
| template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ; | ||||
| template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; | ||||
| template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >; | ||||
| template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >; | ||||
| @@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF; | ||||
| typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD; | ||||
| typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2; | ||||
|  | ||||
| // LorentzComplex | ||||
| typedef iLorentzComplex<Complex  > LorentzComplex; | ||||
| typedef iLorentzComplex<ComplexF > LorentzComplexF; | ||||
| typedef iLorentzComplex<ComplexD > LorentzComplexD; | ||||
|  | ||||
| typedef iLorentzComplex<vComplex > vLorentzComplex; | ||||
| typedef iLorentzComplex<vComplexF> vLorentzComplexF; | ||||
| typedef iLorentzComplex<vComplexD> vLorentzComplexD; | ||||
|  | ||||
| // DoubleStored gauge field | ||||
| typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix; | ||||
| typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; | ||||
| @@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF; | ||||
| typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD; | ||||
| typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2; | ||||
|  | ||||
| typedef Lattice<vLorentzComplex>  LatticeLorentzComplex; | ||||
| typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF; | ||||
| typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD; | ||||
|  | ||||
| // DoubleStored gauge field | ||||
| typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix; | ||||
| typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF; | ||||
|   | ||||
| @@ -34,10 +34,24 @@ directory | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /////////////////////////////////// | ||||
| // Smart configuration base class | ||||
| /////////////////////////////////// | ||||
| template< class Field > | ||||
| class ConfigurationBase | ||||
| { | ||||
| public: | ||||
|   ConfigurationBase() {} | ||||
|   virtual ~ConfigurationBase() {} | ||||
|   virtual void set_Field(Field& U) =0; | ||||
|   virtual void smeared_force(Field&) = 0; | ||||
|   virtual Field& get_SmearedU() =0; | ||||
|   virtual Field &get_U(bool smeared = false) = 0; | ||||
| }; | ||||
|  | ||||
| template <class GaugeField > | ||||
| class Action  | ||||
| { | ||||
|  | ||||
| public: | ||||
|   bool is_smeared = false; | ||||
|   RealD deriv_norm_sum; | ||||
| @@ -77,16 +91,60 @@ public: | ||||
|   void refresh_timer_stop(void)  { refresh_us+=usecond(); } | ||||
|   void S_timer_start(void)       { S_us-=usecond(); } | ||||
|   void S_timer_stop(void)        { S_us+=usecond(); } | ||||
|   ///////////////////////////// | ||||
|   // Heatbath? | ||||
|   ///////////////////////////// | ||||
|   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions | ||||
|   virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action | ||||
|   virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? | ||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////// | ||||
|   // virtual smeared interface through configuration container | ||||
|   ///////////////////////////////////////////////////////////// | ||||
|   virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) | ||||
|   { | ||||
|     refresh(U.get_U(is_smeared),sRNG,pRNG); | ||||
|   } | ||||
|   virtual RealD S(ConfigurationBase<GaugeField>& U) | ||||
|   { | ||||
|     return S(U.get_U(is_smeared)); | ||||
|   } | ||||
|   virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)  | ||||
|   { | ||||
|     return Sinitial(U.get_U(is_smeared)); | ||||
|   } | ||||
|   virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU) | ||||
|   { | ||||
|     deriv(U.get_U(is_smeared),dSdU);  | ||||
|     if ( is_smeared ) { | ||||
|       U.smeared_force(dSdU); | ||||
|     } | ||||
|   } | ||||
|   /////////////////////////////// | ||||
|   // Logging | ||||
|   /////////////////////////////// | ||||
|   virtual std::string action_name()    = 0;                             // return the action name | ||||
|   virtual std::string LogParameters()  = 0;                             // prints action parameters | ||||
|   virtual ~Action(){} | ||||
| }; | ||||
|  | ||||
| template <class GaugeField > | ||||
| class EmptyAction : public Action <GaugeField> | ||||
| { | ||||
|   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions | ||||
|   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action | ||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative | ||||
|  | ||||
|   /////////////////////////////// | ||||
|   // Logging | ||||
|   /////////////////////////////// | ||||
|   virtual std::string action_name()    { return std::string("Level Force Log"); }; | ||||
|   virtual std::string LogParameters()  { return std::string("No parameters");}; | ||||
| }; | ||||
|  | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif // ACTION_BASE_H | ||||
|   | ||||
| @@ -30,6 +30,8 @@ directory | ||||
| #ifndef QCD_ACTION_CORE | ||||
| #define QCD_ACTION_CORE | ||||
|  | ||||
| #include <Grid/qcd/action/gauge/GaugeImplementations.h> | ||||
|  | ||||
| #include <Grid/qcd/action/ActionBase.h> | ||||
| NAMESPACE_CHECK(ActionBase); | ||||
| #include <Grid/qcd/action/ActionSet.h> | ||||
|   | ||||
| @@ -126,6 +126,16 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermi | ||||
| typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF; | ||||
| typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD; | ||||
|  | ||||
| // Sp(2n) | ||||
| typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF; | ||||
| typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD; | ||||
|  | ||||
| typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF; | ||||
| typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD; | ||||
|  | ||||
| typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF; | ||||
| typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD; | ||||
|  | ||||
| // Twisted mass fermion | ||||
| typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2; | ||||
| typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; | ||||
|   | ||||
| @@ -507,6 +507,7 @@ public: | ||||
|     } | ||||
|     this->face_table_computed=1; | ||||
|     assert(this->u_comm_offset==this->_unified_buffer_size); | ||||
|     accelerator_barrier(); | ||||
|   } | ||||
|  | ||||
| }; | ||||
|   | ||||
| @@ -261,6 +261,22 @@ typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > W | ||||
| typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float | ||||
| typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double | ||||
|  | ||||
| //sp 2n | ||||
|  | ||||
| typedef WilsonImpl<vComplex,  SpFundamentalRepresentation, CoeffReal > SpWilsonImplR;  // Real.. whichever prec | ||||
| typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF;  // Float | ||||
| typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD;  // Double | ||||
|  | ||||
| typedef WilsonImpl<vComplex,  SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR;  // Real.. whichever prec | ||||
| typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF;  // Float | ||||
| typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD;  // Double | ||||
|  | ||||
| typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR;  // Real.. whichever prec | ||||
| typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF;  // Float | ||||
| typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD;  // Double | ||||
|  | ||||
| typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR;  // Real.. whichever prec    // adj = 2indx symmetric for Sp(2N) | ||||
| typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF;  // Float     // adj = 2indx symmetric for Sp(2N) | ||||
| typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD;  // Double    // adj = 2indx symmetric for Sp(2N) | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | ||||
|   ///////////////////////////// | ||||
|   { | ||||
|     GRID_TRACE("Gather"); | ||||
|     st.HaloExchangeOptGather(in,compressor); | ||||
|     accelerator_barrier(); | ||||
|     st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine | ||||
|   } | ||||
|    | ||||
|   std::vector<std::vector<CommsRequest_t> > requests; | ||||
|   | ||||
| @@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | ||||
| #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); | ||||
|  | ||||
| #define KERNEL_CALL_EXT(A)						\ | ||||
|   const uint64_t    NN = Nsite*Ls;					\ | ||||
|   const uint64_t    sz = st.surface_list.size();			\ | ||||
|   auto ptr = &st.surface_list[0];					\ | ||||
|   accelerator_forNB( ss, sz, Simd::Nsimd(), {				\ | ||||
|       int sF = ptr[ss];							\ | ||||
|       int sU = ss/Ls;							\ | ||||
|       int sU = sF/Ls;							\ | ||||
|       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ | ||||
|     });									 | ||||
|     });									\ | ||||
|   accelerator_barrier(); | ||||
|  | ||||
| #define ASM_CALL(A)							\ | ||||
|   thread_for( sss, Nsite, {						\ | ||||
| @@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|  | ||||
|    if( interior && exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||
| #ifdef SYCL_HACK      | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; } | ||||
| #else | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||
| #endif      | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;} | ||||
| #endif | ||||
| @@ -478,8 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;} | ||||
|      // dependent on result of merge | ||||
|      acceleratorFenceComputeStream(); | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;} | ||||
| #endif | ||||
| @@ -502,21 +500,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      // Dependent on result of merge | ||||
|      acceleratorFenceComputeStream(); | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } | ||||
|    assert(0 && " Kernel optimisation case not covered "); | ||||
|   } | ||||
|   | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonImplD | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonImplF | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplF | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplD | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -0,0 +1 @@ | ||||
| #define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplF | ||||
| @@ -1 +0,0 @@ | ||||
| ../CayleyFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../ContinuedFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../DomainWallEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../MobiusEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../PartialFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| #define IMPLEMENTATION WilsonImplD2 | ||||
| @@ -1 +0,0 @@ | ||||
| ../CayleyFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../ContinuedFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../DomainWallEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../MobiusEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../PartialFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| #define IMPLEMENTATION ZWilsonImplD2 | ||||
| @@ -10,12 +10,18 @@ WILSON_IMPL_LIST=" \ | ||||
| 	   WilsonImplF \ | ||||
| 	   WilsonImplD \ | ||||
| 	   WilsonImplD2 \ | ||||
| 	   SpWilsonImplF \ | ||||
| 	   SpWilsonImplD \ | ||||
| 	   WilsonAdjImplF \ | ||||
| 	   WilsonAdjImplD \ | ||||
| 	   WilsonTwoIndexSymmetricImplF \ | ||||
| 	   WilsonTwoIndexSymmetricImplD \ | ||||
| 	   WilsonTwoIndexAntiSymmetricImplF \ | ||||
| 	   WilsonTwoIndexAntiSymmetricImplD \ | ||||
| 	   SpWilsonTwoIndexAntiSymmetricImplF \ | ||||
| 	   SpWilsonTwoIndexAntiSymmetricImplD \ | ||||
| 	   SpWilsonTwoIndexSymmetricImplF \ | ||||
| 	   SpWilsonTwoIndexSymmetricImplD \ | ||||
| 	   GparityWilsonImplF \ | ||||
| 	   GparityWilsonImplD " | ||||
|  | ||||
|   | ||||
| @@ -39,6 +39,9 @@ NAMESPACE_BEGIN(Grid); | ||||
| typedef WilsonGaugeAction<PeriodicGimplR>          WilsonGaugeActionR; | ||||
| typedef WilsonGaugeAction<PeriodicGimplF>          WilsonGaugeActionF; | ||||
| typedef WilsonGaugeAction<PeriodicGimplD>          WilsonGaugeActionD; | ||||
| typedef WilsonGaugeAction<SpPeriodicGimplR>        SpWilsonGaugeActionR; | ||||
| typedef WilsonGaugeAction<SpPeriodicGimplF>        SpWilsonGaugeActionF; | ||||
| typedef WilsonGaugeAction<SpPeriodicGimplD>        SpWilsonGaugeActionD; | ||||
| typedef PlaqPlusRectangleAction<PeriodicGimplR>    PlaqPlusRectangleActionR; | ||||
| typedef PlaqPlusRectangleAction<PeriodicGimplF>    PlaqPlusRectangleActionF; | ||||
| typedef PlaqPlusRectangleAction<PeriodicGimplD>    PlaqPlusRectangleActionD; | ||||
|   | ||||
| @@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|   typedef typename Impl::Field Field; | ||||
|  | ||||
| // hardcodes the exponential approximation in the template | ||||
| template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes { | ||||
| template <class S, int Nrepresentation = Nc, int Nexp = 12, class Group = SU<Nc> > class GaugeImplTypes { | ||||
| public: | ||||
|   typedef S Simd; | ||||
|   typedef typename Simd::scalar_type scalar_type; | ||||
| @@ -78,8 +78,6 @@ public: | ||||
|   typedef Lattice<SiteLink>    LinkField;  | ||||
|   typedef Lattice<SiteField>   Field; | ||||
|  | ||||
|   typedef SU<Nrepresentation> Group; | ||||
|  | ||||
|   // Guido: we can probably separate the types from the HMC functions | ||||
|   // this will create 2 kind of implementations | ||||
|   // probably confusing the users | ||||
| @@ -119,6 +117,7 @@ public: | ||||
|     // | ||||
|     LinkField Pmu(P.Grid()); | ||||
|     Pmu = Zero(); | ||||
|  | ||||
|     for (int mu = 0; mu < Nd; mu++) { | ||||
|       Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu); | ||||
|       RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ; | ||||
| @@ -126,8 +125,12 @@ public: | ||||
|       PokeIndex<LorentzIndex>(P, Pmu, mu); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   static inline Field projectForce(Field &P) { return Ta(P); } | ||||
|      | ||||
|   static inline Field projectForce(Field &P) { | ||||
|       Field ret(P.Grid()); | ||||
|       Group::taProj(P, ret); | ||||
|       return ret; | ||||
|     } | ||||
|  | ||||
|   static inline void update_field(Field& P, Field& U, double ep){ | ||||
|     //static std::chrono::duration<double> diff; | ||||
| @@ -137,14 +140,15 @@ public: | ||||
|     autoView(P_v,P,AcceleratorRead); | ||||
|     accelerator_for(ss, P.Grid()->oSites(),1,{ | ||||
|       for (int mu = 0; mu < Nd; mu++) { | ||||
|         U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); | ||||
|           U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); | ||||
|           U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); | ||||
|       } | ||||
|     }); | ||||
|    //auto end = std::chrono::high_resolution_clock::now(); | ||||
|    // diff += end - start; | ||||
|    // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; | ||||
|   } | ||||
|  | ||||
|      | ||||
|   static inline RealD FieldSquareNorm(Field& U){ | ||||
|     LatticeComplex Hloc(U.Grid()); | ||||
|     Hloc = Zero(); | ||||
| @@ -157,7 +161,7 @@ public: | ||||
|   } | ||||
|  | ||||
|   static inline void Project(Field &U) { | ||||
|     ProjectSUn(U); | ||||
|     Group::ProjectOnSpecialGroup(U); | ||||
|   } | ||||
|  | ||||
|   static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||
| @@ -171,6 +175,7 @@ public: | ||||
|   static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||
|     Group::ColdConfiguration(pRNG, U); | ||||
|   } | ||||
|  | ||||
| }; | ||||
|  | ||||
|  | ||||
| @@ -178,10 +183,17 @@ typedef GaugeImplTypes<vComplex, Nc> GimplTypesR; | ||||
| typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF; | ||||
| typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD; | ||||
|  | ||||
| typedef GaugeImplTypes<vComplex, Nc, 12, Sp<Nc> > SpGimplTypesR; | ||||
| typedef GaugeImplTypes<vComplexF, Nc, 12, Sp<Nc> > SpGimplTypesF; | ||||
| typedef GaugeImplTypes<vComplexD, Nc, 12, Sp<Nc> > SpGimplTypesD; | ||||
|  | ||||
| typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR; | ||||
| typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF; | ||||
| typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD; | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif // GRID_GAUGE_IMPL_TYPES_H | ||||
|   | ||||
| @@ -176,7 +176,7 @@ public: | ||||
|       return PeriodicBC::CshiftLink(Link,mu,shift); | ||||
|   } | ||||
|  | ||||
|   static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; } | ||||
|   static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; } | ||||
|   static inline std::vector<int> getDirections(void) { return _conjDirs; } | ||||
|   static inline bool isPeriodicGaugeField(void) { return false; } | ||||
| }; | ||||
| @@ -193,6 +193,11 @@ typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever pre | ||||
| typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float | ||||
| typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double | ||||
|  | ||||
| typedef PeriodicGaugeImpl<SpGimplTypesR> SpPeriodicGimplR; // Real.. whichever prec | ||||
| typedef PeriodicGaugeImpl<SpGimplTypesF> SpPeriodicGimplF; // Float | ||||
| typedef PeriodicGaugeImpl<SpGimplTypesD> SpPeriodicGimplD; // Double | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -43,7 +43,7 @@ public: | ||||
| private: | ||||
|   RealD c_plaq; | ||||
|   RealD c_rect; | ||||
|  | ||||
|   typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace; | ||||
| public: | ||||
|   PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){}; | ||||
|  | ||||
| @@ -79,27 +79,18 @@ public: | ||||
|     GridBase *grid = Umu.Grid(); | ||||
|  | ||||
|     std::vector<GaugeLinkField> U (Nd,grid); | ||||
|     std::vector<GaugeLinkField> U2(Nd,grid); | ||||
|  | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|       U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|       WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu); | ||||
|     } | ||||
|     std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid); | ||||
|     WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace); | ||||
|  | ||||
|     GaugeLinkField dSdU_mu(grid); | ||||
|     GaugeLinkField staple(grid); | ||||
|  | ||||
|     for (int mu=0; mu < Nd; mu++){ | ||||
|  | ||||
|       // Staple in direction mu | ||||
|  | ||||
|       WilsonLoops<Gimpl>::Staple(staple,Umu,mu); | ||||
|  | ||||
|       dSdU_mu = Ta(U[mu]*staple)*factor_p; | ||||
|  | ||||
|       WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu); | ||||
|  | ||||
|       dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; | ||||
|       dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p; | ||||
|       dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r; | ||||
| 	   | ||||
|       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); | ||||
|     } | ||||
|   | ||||
| @@ -38,91 +38,73 @@ NAMESPACE_BEGIN(Grid); | ||||
|     // cf. GeneralEvenOddRational.h for details | ||||
|     ///////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|        | ||||
|     template<class ImplD, class ImplF, class ImplD2> | ||||
|     template<class ImplD, class ImplF> | ||||
|     class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> { | ||||
|     private: | ||||
|       typedef typename ImplD2::FermionField FermionFieldD2; | ||||
|       typedef typename ImplD::FermionField FermionFieldD; | ||||
|       typedef typename ImplF::FermionField FermionFieldF; | ||||
|  | ||||
|       FermionOperator<ImplD> & NumOpD; | ||||
|       FermionOperator<ImplD> & DenOpD; | ||||
|  | ||||
|       FermionOperator<ImplD2> & NumOpD2; | ||||
|       FermionOperator<ImplD2> & DenOpD2; | ||||
|       | ||||
|       FermionOperator<ImplF> & NumOpF; | ||||
|       FermionOperator<ImplF> & DenOpF; | ||||
|  | ||||
|       Integer ReliableUpdateFreq; | ||||
|     protected: | ||||
|  | ||||
|       //Action evaluation | ||||
|       //Allow derived classes to override the multishift CG | ||||
|       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){ | ||||
| #if 0 | ||||
| #if 1 | ||||
| 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD); | ||||
| 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx); | ||||
| 	msCG(schurOp,in, out); | ||||
| #else | ||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | ||||
| 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||
| 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF); | ||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||
| 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||
|  | ||||
| 	// Action better with higher precision? | ||||
| 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	precisionChange(inD2,in); | ||||
| 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; | ||||
| 	msCG(schurOpD2, inD2, outD2); | ||||
| 	precisionChange(out,outD2); | ||||
| 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	msCG(schurOpD, in, out); | ||||
| #endif | ||||
|       } | ||||
|       //Force evaluation | ||||
|       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){ | ||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | ||||
| 	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF); | ||||
| 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||
| 	SchurDifferentiableOperator<ImplF>  schurOpF(numerator ? NumOpF  : DenOpF); | ||||
|  | ||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid()); | ||||
| 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	precisionChange(inD2,in); | ||||
| 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; | ||||
| 	msCG(schurOpD2, inD2, out_elemsD2, outD2); | ||||
| 	precisionChange(out,outD2); | ||||
| 	for(int i=0;i<out_elems.size();i++){ | ||||
| 	  precisionChange(out_elems[i],out_elemsD2[i]); | ||||
| 	} | ||||
| 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||
| 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||
| 	std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid()); | ||||
| 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	msCG(schurOpD, in, out_elems, out); | ||||
|       } | ||||
|       //Allow derived classes to override the gauge import | ||||
|       virtual void ImportGauge(const typename ImplD::GaugeField &Ud){ | ||||
|  | ||||
| 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid()); | ||||
| 	typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid()); | ||||
| 	precisionChange(Uf, Ud); | ||||
| 	precisionChange(Ud2, Ud); | ||||
|  | ||||
| 	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl; | ||||
| 	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl; | ||||
| 	 | ||||
| 	NumOpD.ImportGauge(Ud); | ||||
| 	DenOpD.ImportGauge(Ud); | ||||
|  | ||||
| 	NumOpF.ImportGauge(Uf); | ||||
| 	DenOpF.ImportGauge(Uf); | ||||
|  | ||||
| 	NumOpD2.ImportGauge(Ud2); | ||||
| 	DenOpD2.ImportGauge(Ud2); | ||||
|       } | ||||
|        | ||||
|     public: | ||||
|       GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD,  | ||||
| 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,  | ||||
| 							      FermionOperator<ImplD2>  &_NumOpD2, FermionOperator<ImplD2>  &_DenOpD2,  | ||||
| 							      const RationalActionParams & p, Integer _ReliableUpdateFreq | ||||
| 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p), | ||||
| 								  ReliableUpdateFreq(_ReliableUpdateFreq), | ||||
| 								  NumOpD(_NumOpD), DenOpD(_DenOpD), | ||||
| 								  NumOpF(_NumOpF), DenOpF(_DenOpF), | ||||
| 								  NumOpD2(_NumOpD2), DenOpD2(_DenOpD2) | ||||
| 								  NumOpF(_NumOpF), DenOpF(_DenOpF) | ||||
|       {} | ||||
|        | ||||
|       virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";} | ||||
|   | ||||
| @@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid); | ||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||
|     }; | ||||
|  | ||||
|     template<class Impl,class ImplF,class ImplD2> | ||||
|     template<class Impl,class ImplF> | ||||
|     class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction | ||||
|       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> { | ||||
|       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> { | ||||
|     public: | ||||
|       typedef OneFlavourRationalParams Params; | ||||
|     private: | ||||
| @@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid); | ||||
| 								 FermionOperator<Impl>  &_DenOp,  | ||||
| 								 FermionOperator<ImplF>  &_NumOpF,  | ||||
| 								 FermionOperator<ImplF>  &_DenOpF,  | ||||
| 								 FermionOperator<ImplD2>  &_NumOpD2,  | ||||
| 								 FermionOperator<ImplD2>  &_DenOpD2,  | ||||
| 								 const Params & p, Integer ReliableUpdateFreq | ||||
| 							) :  | ||||
| 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){} | ||||
| 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){} | ||||
|  | ||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||
|     }; | ||||
|   | ||||
| @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); | ||||
|         // NumOp == V | ||||
|         // DenOp == M | ||||
|         // | ||||
|     AUDIT(); | ||||
|         FermionField etaOdd (NumOp.FermionRedBlackGrid()); | ||||
|         FermionField etaEven(NumOp.FermionRedBlackGrid()); | ||||
|         FermionField tmp    (NumOp.FermionRedBlackGrid()); | ||||
|  | ||||
|     AUDIT(); | ||||
|         pickCheckerboard(Even,etaEven,eta); | ||||
|     AUDIT(); | ||||
|         pickCheckerboard(Odd,etaOdd,eta); | ||||
|  | ||||
|     AUDIT(); | ||||
|         NumOp.ImportGauge(U); | ||||
|     AUDIT(); | ||||
|         DenOp.ImportGauge(U); | ||||
| 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl; | ||||
|     AUDIT(); | ||||
|  | ||||
|         SchurDifferentiableOperator<Impl> Mpc(DenOp); | ||||
|     AUDIT(); | ||||
|         SchurDifferentiableOperator<Impl> Vpc(NumOp); | ||||
|     AUDIT(); | ||||
|  | ||||
| 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl; | ||||
|     AUDIT(); | ||||
|         // Odd det factors | ||||
|         Mpc.MpcDag(etaOdd,PhiOdd); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl; | ||||
|         tmp=Zero(); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl; | ||||
|     AUDIT(); | ||||
|         HeatbathSolver(Vpc,PhiOdd,tmp); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl; | ||||
|         Vpc.Mpc(tmp,PhiOdd);             | ||||
| 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl; | ||||
| @@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid); | ||||
|         //X = (Mdag M)^-1 V^dag phi | ||||
|         //Y = (Mdag)^-1 V^dag  phi | ||||
|         Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi | ||||
| 	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl; | ||||
|         X=Zero(); | ||||
|         DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi | ||||
| 	std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl; | ||||
|         Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi | ||||
| 	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl; | ||||
|  | ||||
|         // phi^dag V (Mdag M)^-1 dV^dag  phi | ||||
|         Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force; | ||||
| 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||
|    | ||||
|         // phi^dag dV (Mdag M)^-1 V^dag  phi | ||||
|         Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force; | ||||
| 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||
|  | ||||
|         //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi | ||||
|         //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi | ||||
|         Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force; | ||||
| 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||
|         Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force; | ||||
| 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||
|  | ||||
|         // FIXME No force contribution from EvenEven assumed here | ||||
|         // Needs a fix for clover. | ||||
|   | ||||
| @@ -225,6 +225,18 @@ template <class RepresentationsPolicy, | ||||
| using GenericHMCRunnerHirep = | ||||
| 				     HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>; | ||||
|  | ||||
| // sp2n | ||||
|  | ||||
| template <template <typename, typename, typename> class Integrator> | ||||
| using GenericSpHMCRunner = HMCWrapperTemplate<SpPeriodicGimplR, Integrator>; | ||||
|  | ||||
| template <class RepresentationsPolicy, | ||||
|           template <typename, typename, typename> class Integrator> | ||||
| using GenericSpHMCRunnerHirep = | ||||
|                      HMCWrapperTemplate<SpPeriodicGimplR, Integrator, RepresentationsPolicy>; | ||||
|  | ||||
|  | ||||
|  | ||||
| template <class Implementation, class RepresentationsPolicy,  | ||||
|           template <typename, typename, typename> class Integrator> | ||||
| using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>; | ||||
|   | ||||
| @@ -283,12 +283,13 @@ public: | ||||
|       std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl; | ||||
|  | ||||
|       TheIntegrator.print_timer(); | ||||
|  | ||||
|        | ||||
|       TheIntegrator.Smearer.set_Field(Ucur); | ||||
|       for (int obs = 0; obs < Observables.size(); obs++) { | ||||
|       	std::cout << GridLogDebug << "Observables # " << obs << std::endl; | ||||
|       	std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl; | ||||
|       	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl; | ||||
|         Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG); | ||||
|         Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG); | ||||
|       } | ||||
|       std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl; | ||||
|     } | ||||
|   | ||||
| @@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable { | ||||
| public: | ||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,  | ||||
| 				  std::string, config_prefix,  | ||||
| 				  std::string, smeared_prefix,  | ||||
| 				  std::string, rng_prefix,  | ||||
| 				  int, saveInterval,  | ||||
| 				  bool, saveSmeared,  | ||||
| 				  std::string, format, ); | ||||
|  | ||||
|   CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng", | ||||
|   CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng", | ||||
| 			 int savemodulo = 1, const std::string &f = "IEEE64BIG") | ||||
|     : config_prefix(cf), | ||||
|       smeared_prefix(sf), | ||||
|       rng_prefix(rn), | ||||
|       saveInterval(savemodulo), | ||||
|       format(f){}; | ||||
| @@ -61,13 +64,21 @@ template <class Impl> | ||||
| class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> { | ||||
| public: | ||||
|   void build_filenames(int traj, CheckpointerParameters &Params, | ||||
|                        std::string &conf_file, std::string &rng_file) { | ||||
|                        std::string &conf_file, | ||||
|                        std::string &smear_file, | ||||
| 		       std::string &rng_file) { | ||||
|     { | ||||
|       std::ostringstream os; | ||||
|       os << Params.rng_prefix << "." << traj; | ||||
|       rng_file = os.str(); | ||||
|     } | ||||
|  | ||||
|     { | ||||
|       std::ostringstream os; | ||||
|       os << Params.smeared_prefix << "." << traj; | ||||
|       smear_file = os.str(); | ||||
|     } | ||||
|  | ||||
|     { | ||||
|       std::ostringstream os; | ||||
|       os << Params.config_prefix << "." << traj; | ||||
| @@ -84,6 +95,11 @@ public: | ||||
|   } | ||||
|   virtual void initialize(const CheckpointerParameters &Params) = 0; | ||||
|  | ||||
|   virtual void TrajectoryComplete(int traj, | ||||
|                                   typename Impl::Field &U, | ||||
|                                   GridSerialRNG &sRNG, | ||||
|                                   GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared | ||||
|    | ||||
|   virtual void CheckpointRestore(int traj, typename Impl::Field &U, | ||||
|                                  GridSerialRNG &sRNG, | ||||
|                                  GridParallelRNG &pRNG) = 0; | ||||
|   | ||||
| @@ -61,11 +61,14 @@ public: | ||||
|     fout.close(); | ||||
|   } | ||||
|  | ||||
|   void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { | ||||
|   void TrajectoryComplete(int traj, | ||||
| 			  ConfigurationBase<Field> &SmartConfig, | ||||
| 			  GridSerialRNG &sRNG, GridParallelRNG &pRNG) | ||||
|   { | ||||
|  | ||||
|     if ((traj % Params.saveInterval) == 0) { | ||||
|       std::string config, rng; | ||||
|       this->build_filenames(traj, Params, config, rng); | ||||
|       std::string config, rng, smr; | ||||
|       this->build_filenames(traj, Params, config, smr, rng); | ||||
|  | ||||
|       uint32_t nersc_csum; | ||||
|       uint32_t scidac_csuma; | ||||
| @@ -74,9 +77,15 @@ public: | ||||
|       BinarySimpleUnmunger<sobj_double, sobj> munge; | ||||
|       truncate(rng); | ||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||
|       truncate(config); | ||||
|       std::cout << GridLogMessage << "Written Binary RNG " << rng | ||||
|                 << " checksum " << std::hex  | ||||
| 		<< nersc_csum   <<"/" | ||||
| 		<< scidac_csuma   <<"/" | ||||
| 		<< scidac_csumb  | ||||
| 		<< std::dec << std::endl; | ||||
|  | ||||
|       BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format, | ||||
|       truncate(config); | ||||
|       BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format, | ||||
| 						      nersc_csum,scidac_csuma,scidac_csumb); | ||||
|  | ||||
|       std::cout << GridLogMessage << "Written Binary Configuration " << config | ||||
| @@ -85,6 +94,18 @@ public: | ||||
| 		<< scidac_csuma   <<"/" | ||||
| 		<< scidac_csumb  | ||||
| 		<< std::dec << std::endl; | ||||
|  | ||||
|       if ( Params.saveSmeared ) { | ||||
| 	truncate(smr); | ||||
| 	BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format, | ||||
| 							nersc_csum,scidac_csuma,scidac_csumb); | ||||
| 	std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr | ||||
|                 << " checksum " << std::hex  | ||||
| 		<< nersc_csum   <<"/" | ||||
| 		<< scidac_csuma   <<"/" | ||||
| 		<< scidac_csumb  | ||||
| 		<< std::dec << std::endl; | ||||
|       } | ||||
|     } | ||||
|  | ||||
|   }; | ||||
|   | ||||
| @@ -69,17 +69,27 @@ public: | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, | ||||
|   void TrajectoryComplete(int traj, | ||||
| 			  ConfigurationBase<GaugeField> &SmartConfig, | ||||
| 			  GridSerialRNG &sRNG, | ||||
|                           GridParallelRNG &pRNG) { | ||||
|     if ((traj % Params.saveInterval) == 0) { | ||||
|       std::string config, rng; | ||||
|       std::string config, rng, smr; | ||||
|       this->build_filenames(traj, Params, config, rng); | ||||
|       GridBase *grid = U.Grid(); | ||||
|       GridBase *grid = SmartConfig.get_U(false).Grid(); | ||||
|       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||
|       std::cout << GridLogMessage << "Written BINARY RNG " << rng | ||||
|                 << " checksum " << std::hex  | ||||
| 		<< nersc_csum<<"/" | ||||
| 		<< scidac_csuma<<"/" | ||||
| 		<< scidac_csumb | ||||
| 		<< std::dec << std::endl; | ||||
|  | ||||
|        | ||||
|       IldgWriter _IldgWriter(grid->IsBoss()); | ||||
|       _IldgWriter.open(config); | ||||
|       _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config); | ||||
|       _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config); | ||||
|       _IldgWriter.close(); | ||||
|  | ||||
|       std::cout << GridLogMessage << "Written ILDG Configuration on " << config | ||||
| @@ -88,6 +98,21 @@ public: | ||||
| 		<< scidac_csuma<<"/" | ||||
| 		<< scidac_csumb | ||||
| 		<< std::dec << std::endl; | ||||
|  | ||||
|       if ( Params.saveSmeared ) {  | ||||
| 	IldgWriter _IldgWriter(grid->IsBoss()); | ||||
| 	_IldgWriter.open(smr); | ||||
| 	_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config); | ||||
| 	_IldgWriter.close(); | ||||
|  | ||||
| 	std::cout << GridLogMessage << "Written ILDG Configuration on " << smr | ||||
|                 << " checksum " << std::hex  | ||||
| 		<< nersc_csum<<"/" | ||||
| 		<< scidac_csuma<<"/" | ||||
| 		<< scidac_csumb | ||||
| 		<< std::dec << std::endl; | ||||
|       } | ||||
|  | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   | ||||
| @@ -52,23 +52,29 @@ public: | ||||
|     Params.format = "IEEE64BIG";  // fixed, overwrite any other choice | ||||
|   } | ||||
|  | ||||
|   void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, | ||||
|                           GridParallelRNG &pRNG) { | ||||
|   virtual void TrajectoryComplete(int traj, | ||||
|                                   ConfigurationBase<GaugeField> &SmartConfig, | ||||
|                                   GridSerialRNG &sRNG, | ||||
|                                   GridParallelRNG &pRNG) | ||||
|   { | ||||
|     if ((traj % Params.saveInterval) == 0) { | ||||
|       std::string config, rng; | ||||
|       this->build_filenames(traj, Params, config, rng); | ||||
|  | ||||
|       std::string config, rng, smr; | ||||
|       this->build_filenames(traj, Params, config, smr, rng); | ||||
|        | ||||
|       int precision32 = 1; | ||||
|       int tworow = 0; | ||||
|       NerscIO::writeRNGState(sRNG, pRNG, rng); | ||||
|       NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32); | ||||
|       NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32); | ||||
|       if ( Params.saveSmeared ) { | ||||
| 	NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG, | ||||
|                          GridParallelRNG &pRNG) { | ||||
|     std::string config, rng; | ||||
|     this->build_filenames(traj, Params, config, rng); | ||||
|     std::string config, rng, smr; | ||||
|     this->build_filenames(traj, Params, config, smr, rng ); | ||||
|     this->check_filename(rng); | ||||
|     this->check_filename(config); | ||||
|  | ||||
|   | ||||
| @@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, | ||||
|   void TrajectoryComplete(int traj,  | ||||
| 			  ConfigurationBase<Field> &SmartConfig, | ||||
| 			  GridSerialRNG &sRNG, | ||||
|                           GridParallelRNG &pRNG) { | ||||
|     if ((traj % Params.saveInterval) == 0) { | ||||
|       std::string config, rng; | ||||
|       this->build_filenames(traj, Params, config, rng); | ||||
|       GridBase *grid = U.Grid(); | ||||
|       std::string config, rng,smr; | ||||
|       this->build_filenames(traj, Params, config, smr, rng); | ||||
|       GridBase *grid = SmartConfig.get_U(false).Grid(); | ||||
|       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||
|       ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||
|       _ScidacWriter.open(config); | ||||
|       _ScidacWriter.writeScidacFieldRecord(U, MData); | ||||
|       _ScidacWriter.close(); | ||||
|       std::cout << GridLogMessage << "Written Binary RNG " << rng | ||||
|                 << " checksum " << std::hex  | ||||
| 		<< nersc_csum   <<"/" | ||||
| 		<< scidac_csuma   <<"/" | ||||
| 		<< scidac_csumb  | ||||
| 		<< std::dec << std::endl; | ||||
|  | ||||
|  | ||||
|       { | ||||
| 	ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||
| 	_ScidacWriter.open(config); | ||||
| 	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData); | ||||
| 	_ScidacWriter.close(); | ||||
|       } | ||||
|        | ||||
|       if ( Params.saveSmeared ) { | ||||
| 	ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||
| 	_ScidacWriter.open(smr); | ||||
| 	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData); | ||||
| 	_ScidacWriter.close(); | ||||
|       } | ||||
|       std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl; | ||||
|     } | ||||
|   }; | ||||
|   | ||||
| @@ -66,6 +66,7 @@ public: | ||||
| template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy> | ||||
| class Integrator { | ||||
| protected: | ||||
| public: | ||||
|   typedef FieldImplementation_ FieldImplementation; | ||||
|   typedef typename FieldImplementation::Field MomentaField;  //for readability | ||||
|   typedef typename FieldImplementation::Field Field; | ||||
| @@ -86,6 +87,8 @@ protected: | ||||
|  | ||||
|   const ActionSet<Field, RepresentationPolicy> as; | ||||
|  | ||||
|   ActionSet<Field,RepresentationPolicy> LevelForces; | ||||
|    | ||||
|   //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default | ||||
|   static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){  | ||||
|     static MomentumFilterNone<MomentaField> filter; | ||||
| @@ -96,7 +99,6 @@ protected: | ||||
|   { | ||||
|     t_P[level] += ep; | ||||
|     update_P(P, U, level, ep); | ||||
|  | ||||
|     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; | ||||
|   } | ||||
|  | ||||
| @@ -124,37 +126,33 @@ protected: | ||||
|     // input U actually not used in the fundamental case | ||||
|     // Fundamental updates, include smearing | ||||
|  | ||||
|     assert(as.size()==LevelForces.size()); | ||||
|      | ||||
|     Field level_force(U.Grid()); level_force =Zero(); | ||||
|     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||
|  | ||||
|       double start_full = usecond(); | ||||
|       Field force(U.Grid()); | ||||
|       conformable(U.Grid(), Mom.Grid()); | ||||
|  | ||||
|       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); | ||||
|       double start_force = usecond(); | ||||
|  | ||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl; | ||||
|       AUDIT(); | ||||
|        | ||||
|       as[level].actions.at(a)->deriv_timer_start(); | ||||
|       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||
|       as[level].actions.at(a)->deriv_timer_stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl; | ||||
|       AUDIT(); | ||||
|  | ||||
|       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||
|       auto name = as[level].actions.at(a)->action_name(); | ||||
|       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); | ||||
|  | ||||
|       force = FieldImplementation::projectForce(force); // Ta for gauge fields | ||||
|       double end_force = usecond(); | ||||
|  | ||||
|       //      DumpSliceNorm("force ",force,Nd-1); | ||||
|       MomFilter->applyFilter(force); | ||||
|       std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl; | ||||
|       DumpSliceNorm("force filtered ",force,Nd-1); | ||||
|        | ||||
|       MomFilter->applyFilter(force); | ||||
|  | ||||
|       std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl; | ||||
|  | ||||
|       // track the total | ||||
|       level_force = level_force+force; | ||||
|  | ||||
|       Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  | ||||
|       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||
|  | ||||
| @@ -177,6 +175,16 @@ protected: | ||||
|  | ||||
|     } | ||||
|  | ||||
|     { | ||||
|       // total force | ||||
|       Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  | ||||
|       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||
|  | ||||
|       Real force_max   = std::sqrt(maxLocalNorm2(level_force)); | ||||
|       Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||
|       LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); | ||||
|     } | ||||
|  | ||||
|     // Force from the other representations | ||||
|     as[level].apply(update_P_hireps, Representations, Mom, U, ep); | ||||
|  | ||||
| @@ -226,6 +234,16 @@ public: | ||||
|  | ||||
|     //Default the momentum filter to "do-nothing" | ||||
|     MomFilter = getDefaultMomFilter(); | ||||
|  | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       int multiplier = as.at(level).multiplier; | ||||
|       ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier); | ||||
|       Level->push_back(new EmptyAction<Field>);  | ||||
|       LevelForces.push_back(*Level); | ||||
|       // does it copy by value or reference?? | ||||
|       // - answer it copies by value, BUT the action level contains a reference that is NOT updated. | ||||
|       // Unsafe code in Guido's area | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   virtual ~Integrator() {} | ||||
| @@ -243,10 +261,14 @@ public: | ||||
|  | ||||
|   void reset_timer(void) | ||||
|   { | ||||
|     assert(as.size()==LevelForces.size()); | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
|         as[level].actions.at(actionID)->reset_timer(); | ||||
|       } | ||||
|       int actionID=0; | ||||
|       assert(LevelForces.at(level).actions.size()==1); | ||||
|       LevelForces.at(level).actions.at(actionID)->reset_timer(); | ||||
|     } | ||||
|   } | ||||
|   void print_timer(void) | ||||
| @@ -308,6 +330,16 @@ public: | ||||
| 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num | ||||
| 		  << std::endl; | ||||
|       } | ||||
|       int actionID=0; | ||||
|       std::cout << GridLogMessage  | ||||
| 		  << LevelForces[level].actions.at(actionID)->action_name() | ||||
| 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t " | ||||
| 		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average() | ||||
| 		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average() | ||||
| 		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average() | ||||
| 		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average() | ||||
| 		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num | ||||
| 		  << std::endl; | ||||
|     } | ||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||
|   } | ||||
| @@ -329,6 +361,13 @@ public: | ||||
| 	std::cout << as[level].actions.at(actionID)->LogParameters(); | ||||
|       } | ||||
|     } | ||||
|     std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl; | ||||
|     for (int level = 0; level < LevelForces.size(); ++level) { | ||||
|       std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl; | ||||
|       for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) { | ||||
| 	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl; | ||||
|       } | ||||
|     } | ||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||
|   } | ||||
|  | ||||
| @@ -379,15 +418,10 @@ public: | ||||
| 	auto name = as[level].actions.at(actionID)->action_name(); | ||||
|         std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl; | ||||
|  | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
|  | ||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl; | ||||
| 	AUDIT(); | ||||
| 	as[level].actions.at(actionID)->refresh_timer_start(); | ||||
|         as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); | ||||
|         as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG); | ||||
| 	as[level].actions.at(actionID)->refresh_timer_stop(); | ||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl; | ||||
| 	AUDIT(); | ||||
|  | ||||
|       } | ||||
|  | ||||
|       // Refresh the higher representation actions | ||||
| @@ -415,6 +449,7 @@ public: | ||||
|   RealD S(Field& U)  | ||||
|   {  // here also U not used | ||||
|  | ||||
|     assert(as.size()==LevelForces.size()); | ||||
|     std::cout << GridLogIntegrator << "Integrator action\n"; | ||||
|  | ||||
|     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||
| @@ -424,17 +459,16 @@ public: | ||||
|     // Actions | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
| 	AUDIT(); | ||||
|  | ||||
|         // get gauge field from the SmearingPolicy and | ||||
|         // based on the boolean is_smeared in actionID | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||
| 	        as[level].actions.at(actionID)->S_timer_start(); | ||||
|         Hterm = as[level].actions.at(actionID)->S(Us); | ||||
|         Hterm = as[level].actions.at(actionID)->S(Smearer); | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
| 	AUDIT(); | ||||
|  | ||||
|       } | ||||
|       as[level].apply(S_hireps, Representations, level, H); | ||||
|     } | ||||
| @@ -447,9 +481,9 @@ public: | ||||
|     void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) { | ||||
|        | ||||
|       for (int a = 0; a < repr_set.size(); ++a) { | ||||
| 	AUDIT(); | ||||
|  | ||||
|         RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); | ||||
| 	AUDIT(); | ||||
|  | ||||
|         std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
|  | ||||
| @@ -471,13 +505,12 @@ public: | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
|         // get gauge field from the SmearingPolicy and | ||||
|         // based on the boolean is_smeared in actionID | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||
| 	        as[level].actions.at(actionID)->S_timer_start(); | ||||
| 	AUDIT(); | ||||
|         Hterm = as[level].actions.at(actionID)->Sinitial(Us); | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
| 	AUDIT(); | ||||
|  | ||||
| 	as[level].actions.at(actionID)->S_timer_start(); | ||||
|         Hterm = as[level].actions.at(actionID)->S(Smearer); | ||||
| 	as[level].actions.at(actionID)->S_timer_stop(); | ||||
|  | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
|       } | ||||
| @@ -490,7 +523,6 @@ public: | ||||
|    | ||||
|   void integrate(Field& U)  | ||||
|   { | ||||
|     AUDIT(); | ||||
|     // reset the clocks | ||||
|     t_U = 0; | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
| @@ -508,10 +540,8 @@ public: | ||||
|       assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same | ||||
|       std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; | ||||
|     } | ||||
|     AUDIT(); | ||||
|  | ||||
|     FieldImplementation::Project(U); | ||||
|     AUDIT(); | ||||
|  | ||||
|     // and that we indeed got to the end of the trajectory | ||||
|     assert(fabs(t_U - Params.trajL) < 1.0e-6); | ||||
|   | ||||
| @@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid); | ||||
| template <class Field> | ||||
| class HmcObservable { | ||||
|  public: | ||||
|   virtual void TrajectoryComplete(int traj, | ||||
|                                   ConfigurationBase<Field> &SmartConfig, | ||||
|                                   GridSerialRNG &sRNG, | ||||
|                                   GridParallelRNG &pRNG) | ||||
|   { | ||||
|     TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable | ||||
|   }; | ||||
|   virtual void TrajectoryComplete(int traj, | ||||
|                                   Field &U, | ||||
|                                   GridSerialRNG &sRNG, | ||||
|   | ||||
| @@ -42,6 +42,18 @@ public: | ||||
|   // necessary for HmcObservable compatibility | ||||
|   typedef typename Impl::Field Field; | ||||
|  | ||||
|   virtual void TrajectoryComplete(int traj, | ||||
|                                   ConfigurationBase<Field> &SmartConfig, | ||||
|                                   GridSerialRNG &sRNG, | ||||
|                                   GridParallelRNG &pRNG) | ||||
|   { | ||||
|     std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl; | ||||
|     std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl; | ||||
|     TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable | ||||
|     std::cout << GridLogMessage << "Smeared plaquette"<<std::endl; | ||||
|     TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable | ||||
|     std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl; | ||||
|   }; | ||||
|   void TrajectoryComplete(int traj, | ||||
|                           Field &U, | ||||
|                           GridSerialRNG &sRNG, | ||||
|   | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user