mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 19:44:32 +00:00 
			
		
		
		
	Compare commits
	
		
			317 Commits
		
	
	
		
			feature/bl
			...
			807c0b46e8
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 807c0b46e8 | ||
| 2a0d75bac2 | |||
|  | f48298ad4e | ||
|  | 645e47c1ba | ||
|  | d1d9827263 | ||
|  | 14643c0aab | ||
|  | b77a9b8947 | ||
|  | 7d077fe493 | ||
|  | 51051df62c | ||
|  | 33097681b9 | ||
|  | 07e4900218 | ||
|  | 36ab567d67 | ||
|  | e19171523b | ||
|  | 9626a2c7c0 | ||
|  | e936f5b80b | ||
|  | ffc0639cb9 | ||
|  | c5b43b322c | ||
|  | c9c4576237 | ||
|  | 6d0c2de399 | ||
|  | 7786ea9921 | ||
|  | d93eac7b1c | ||
|  | afc316f501 | ||
|  | f14bfd5c1b | ||
|  | c5f1420dea | ||
|  | 018e6da872 | ||
|  | b77bccfac2 | ||
|  | 80359e0d49 | ||
|  | 3d437c5cc4 | ||
|  | b8a7004365 | ||
|  | bd56c95a6f | ||
|  | 994512048e | ||
|  | dbd8bb49dc | ||
|  | 3a29af0ce4 | ||
|  | f7b79cdd45 | ||
|  | 075b9d22d0 | ||
|  | b92428f05f | ||
|  | 34b11864b6 | ||
|  | 1dfaa08afb | ||
|  | f44dce390f | ||
|  | bb71e9a96a | ||
| 78bae9417c | |||
| dd170ead01 | |||
| 014704856f | |||
|  | 6f6844ccf1 | ||
|  | 4c6613d72c | ||
|  | ee92e08edb | ||
|  | c1dcee9328 | ||
|  | 559257bbe9 | ||
|  | 6b150961fe | ||
|  | cff1f8d3b8 | ||
|  | f27d2083cd | ||
|  | 36cc9c524f | ||
|  | 2822487450 | ||
|  | e07fafe46a | ||
|  | 063d290bd8 | ||
|  | 4e6194d92a | ||
|  | de30c4e22a | ||
|  | 5bafcaedfa | ||
|  | bfeceae708 | ||
|  | eacb66591f | ||
|  | fadaa85626 | ||
|  | 02a5b0d786 | ||
|  | 0e2141442a | ||
|  | 769eb0eecb | ||
|  | 4241c7d4a3 | ||
|  | 7b11075102 | ||
|  | abc658dca5 | ||
|  | 2372275b2c | ||
|  | ef736e8aa4 | ||
|  | 5e539e2d54 | ||
|  | 96773f5254 | ||
|  | d80df09f3b | ||
|  | 621e612c30 | ||
|  | 8c3792721b | ||
|  | c95bbd3948 | ||
|  | e28ab7a732 | ||
|  | c797cbe737 | ||
|  | e09dfbf1c2 | ||
| 85e35c4da1 | |||
|  | d72e914cf0 | ||
|  | 3b5254e2d5 | ||
|  | f1c358b596 | ||
|  | c0ef210265 | ||
|  | e3e1cc1962 | ||
|  | 723eadbb5c | ||
|  | e24637ec1e | ||
|  | 8b01ff4ce7 | ||
|  | 588197c487 | ||
|  | 116d90b0ee | ||
|  | b0646ca187 | ||
|  | 1352bad2e4 | ||
|  | 4895ff260e | ||
|  | 470d93006a | ||
|  | 2f3d03f188 | ||
|  | 8db7c23bee | ||
|  | 69dc5172dc | ||
|  | fd72eb6546 | ||
|  | ffd7301649 | ||
|  | d2a8494044 | ||
|  | 0982e0d19b | ||
|  | 3badbfc3c1 | ||
|  | 5465961e30 | ||
| 477b794bc5 | |||
|  | 4835fd1a87 | ||
|  | 6533c25814 | ||
|  | b405767569 | ||
|  | fe88a0c12f | ||
|  | e61a9ed2b4 | ||
|  | de8daa3824 | ||
|  | 3a50fb29cb | ||
|  | 6647d2656f | ||
|  | a6f4dbeb6d | ||
|  | 92a282f2d8 | ||
|  | ca2fd9fc7b | ||
|  | be1a4f5860 | ||
|  | 1b2914ec09 | ||
|  | 519f795066 | ||
|  | 5897b93dd4 | ||
|  | af091e0881 | ||
|  | 3c1e5e9517 | ||
|  | 85b2cb7a8a | ||
|  | 4240ad5ca8 | ||
|  | d418347d86 | ||
|  | 29a4bfe5e5 | ||
|  | 9955bf9daf | ||
|  | b8bdc2eefb | ||
|  | 0078826ff1 | ||
|  | e855c41772 | ||
|  | d169c275b6 | ||
|  | a5125e23f4 | ||
|  | 7b83c80757 | ||
|  | e41821e206 | ||
|  | 5a75ab15a2 | ||
|  | 932c783fbf | ||
|  | 55f9cce577 | ||
|  | b3533ca847 | ||
|  | fd2a637010 | ||
|  | eee27b8b30 | ||
|  | 8522352aa3 | ||
|  | 3beb8f4091 | ||
|  | 12a706e9b1 | ||
|  | 170aa7df01 | ||
|  | e8ad1fef53 | ||
|  | 876c8f4478 | ||
|  | 9c8750f261 | ||
|  | 91efd08179 | ||
|  | 9953511b65 | ||
|  | 025fa9991a | ||
|  | e8c60c355b | ||
|  | 6c9c7f9d85 | ||
|  | f534523ede | ||
|  | 1b8a834beb | ||
|  | aa9df63a05 | ||
|  | 3953312a93 | ||
|  | 6e62f4f616 | ||
|  | 6a7bdca53b | ||
|  | c7fba9aace | ||
|  | ac6c7cb8d6 | ||
|  | c5924833a1 | ||
|  | ac0a74be0d | ||
|  | 42b0e1125d | ||
|  | 339c4fda79 | ||
|  | 9b85bf9402 | ||
|  | 86b02c3cd8 | ||
|  | 7b3b7093fa | ||
|  | 881b08a465 | ||
|  | 3ee5444c69 | ||
|  | 5e28fe56d2 | ||
|  | 3aa43e6065 | ||
|  | 78ac4044ff | ||
|  | 119c3db47f | ||
|  | 21bbdb8fc2 | ||
|  | 5aabe074fe | ||
|  | 739bd7572c | ||
|  | 074627a5bd | ||
|  | 6a23b2c599 | ||
|  | dace904c10 | ||
|  | be98d26610 | ||
|  | bd891fb3f5 | ||
|  | 3984265851 | ||
|  | 45361d188f | ||
|  | 80c9d77e02 | ||
|  | 3aff64dddb | ||
|  | b4f2ca81ff | ||
|  | d1dea5f840 | ||
|  | 54f8b84d16 | ||
|  | da503fef0e | ||
|  | 4a6802098a | ||
|  | f9b41a84d2 | ||
| 5d7e0d18b9 | |||
| 9e64387933 | |||
| 983b681d46 | |||
| 4072408b6f | |||
| bd76b47fbf | |||
|  | 178376f24b | ||
| 18ce23aa75 | |||
|  | 6a0eb466ee | ||
|  | ffa7fe0cc2 | ||
|  | 6b979f0a69 | ||
|  | 4ea29b8f0f | ||
|  | 778291230a | ||
|  | 86dac5ff4f | ||
|  | 4a382fad3f | ||
|  | cc753670d9 | ||
|  | cc9d88ea1c | ||
|  | b281b0166e | ||
|  | 6a21f694ff | ||
|  | fc4db5e963 | ||
|  | 6252ffaf76 | ||
|  | 026e736dfa | ||
|  | 4275b3f431 | ||
|  | af64c1c6b6 | ||
|  | 866f48391a | ||
|  | a4df527d74 | ||
|  | 5764d21161 | ||
|  | 496d04cd85 | ||
|  | 10e6d7c6ce | ||
|  | c42e25e5b8 | ||
|  | a00ae981e0 | ||
|  | 58e020b62a | ||
|  | a7e1aceeca | ||
|  | 7212432f43 | ||
|  | 4a261fab30 | ||
|  | 6af97069b9 | ||
|  | 5068413cdb | ||
|  | 71c6960eea | ||
|  | ddf6d5c9e3 | ||
| 39214702f6 | |||
| 3e4614c63a | |||
|  | 900e01f49b | ||
|  | 2376156fbc | ||
|  | 3f2fd49db4 | ||
|  | 0efa107cb6 | ||
|  | 8feedb4f6f | ||
|  | 05e562e3d7 | ||
|  | dd3bbb8fa2 | ||
|  | 2fbcf13c46 | ||
|  | 4ea48ef0c4 | ||
|  | 5c85774ee3 | ||
|  | d8a9a745d8 | ||
|  | dcf172da3b | ||
|  | 546be724e7 | ||
|  | 481bbaf1fc | ||
|  | 281488611a | ||
|  | bae0f8ea99 | ||
|  | bbbcd36ae5 | ||
|  | 1b8176e2c0 | ||
|  | cbc053c3db | ||
|  | cdf3f6ef6e | ||
|  | ba7f9d7b70 | ||
|  | 371fd123fb | ||
|  | d6ff644aab | ||
|  | 29586f6b5e | ||
|  | fd057c838f | ||
|  | f51222086c | ||
| a3e935c902 | |||
| 7731c7db8e | |||
| ff97340324 | |||
| 920a51438d | |||
| be528b6d27 | |||
|  | f73691ec47 | ||
|  | ccd21f96ff | ||
|  | 4b90cb8888 | ||
|  | 7ebda3e9ec | ||
|  | b10e1b7bc8 | ||
|  | 7d62f1d6d2 | ||
|  | 458c943987 | ||
|  | 88015b0858 | ||
|  | d7dea44ce7 | ||
|  | 37b6b82869 | ||
|  | 92ad5b8f74 | ||
|  | 8c80f1c168 | ||
|  | 0af7d5a793 | ||
|  | 505fa49983 | ||
|  | 7bcf33def9 | ||
|  | a13820656a | ||
|  | fa71b46a41 | ||
|  | b8b3ae6ac1 | ||
|  | 55c008da21 | ||
|  | 2507606bd0 | ||
|  | 7c2ad4f8c8 | ||
|  | 54c8025aad | ||
|  | 921e23e83c | ||
|  | 6e750ecb0e | ||
|  | b8f1f5d2a3 | ||
|  | 9273f2937c | ||
|  | 1aa28b47ae | ||
|  | 629cb2987a | ||
|  | 03235d6368 | ||
|  | 22064c7e4c | ||
|  | 2de03e5172 | ||
|  | 3af4929dda | ||
|  | 1ba429345b | ||
| 32e6d58356 | |||
|  | 88bdd4344b | ||
|  | 4044536eea | ||
|  | 4d8ae6221c | ||
|  | 4e31e4e094 | ||
|  | 0d6674e489 | ||
|  | b145fd4f5b | ||
|  | 8a5b794f25 | ||
|  | 291e80f88a | ||
|  | 1ace5850ae | ||
|  | 283f14b7c1 | ||
|  | 1d6e708083 | ||
|  | 89457e25e3 | ||
|  | 7e3b298d3d | ||
|  | 7ff3e5eed4 | ||
|  | 19eb51cf41 | ||
|  | 470d4dcc6d | ||
|  | ed03bfd555 | ||
|  | 8c0fbcccae | ||
|  | d4866157fe | ||
|  | b6496b6cb5 | ||
|  | 4f5fe57920 | ||
|  | 11fb943b1e | ||
|  | 046a23121e | 
							
								
								
									
										54
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | |||||||
|  | name: Bug report | ||||||
|  | description: Report a bug. | ||||||
|  | title: "<insert title>" | ||||||
|  | labels: [bug] | ||||||
|  |  | ||||||
|  | body: | ||||||
|  |   - type: markdown | ||||||
|  |     attributes: | ||||||
|  |       value: > | ||||||
|  |         Thank you for taking the time to file a bug report. | ||||||
|  |         Please check that the code is pointing to the HEAD of develop | ||||||
|  |         or any commit in master which is tagged with a version number. | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Describe the issue:" | ||||||
|  |       description: > | ||||||
|  |         Describe the issue and any previous attempt to solve it. | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Code example:" | ||||||
|  |       description: > | ||||||
|  |         If relevant, show how to reproduce the issue using a minimal working | ||||||
|  |         example. | ||||||
|  |       placeholder: | | ||||||
|  |         << your code here >> | ||||||
|  |       render: shell | ||||||
|  |     validations: | ||||||
|  |       required: false | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Target platform:" | ||||||
|  |       description: > | ||||||
|  |         Give a description of the target platform (CPU, network, compiler). | ||||||
|  |         Please give the full CPU part description, using for example | ||||||
|  |         `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) | ||||||
|  |         or `sysctl machdep.cpu.brand_string` (macOS) and the full output | ||||||
|  |         the `--version` option of your compiler. | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Configure options:" | ||||||
|  |       description: > | ||||||
|  |         Please give the exact configure command used and attach | ||||||
|  |         `config.log`, `grid.config.summary` and the output of `make V=1`. | ||||||
|  |       render: shell | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
| @@ -66,6 +66,10 @@ if BUILD_FERMION_REPS | |||||||
|   extra_sources+=$(ADJ_FERMION_FILES) |   extra_sources+=$(ADJ_FERMION_FILES) | ||||||
|   extra_sources+=$(TWOIND_FERMION_FILES) |   extra_sources+=$(TWOIND_FERMION_FILES) | ||||||
| endif | endif | ||||||
|  | if BUILD_SP | ||||||
|  |     extra_sources+=$(SP_FERMION_FILES) | ||||||
|  |     extra_sources+=$(SP_TWOIND_FERMION_FILES) | ||||||
|  | endif | ||||||
|  |  | ||||||
| lib_LIBRARIES = libGrid.a | lib_LIBRARIES = libGrid.a | ||||||
|  |  | ||||||
|   | |||||||
| @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); | |||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | ||||||
|  | #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h> | ||||||
| #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | ||||||
| #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | ||||||
|   | |||||||
| @@ -542,6 +542,7 @@ public: | |||||||
|       (*this)(in[i], out[i]); |       (*this)(in[i], out[i]); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   virtual ~LinearFunction(){}; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { | template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { | ||||||
|   | |||||||
| @@ -191,7 +191,7 @@ public: | |||||||
| 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||||
| 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||||
|  |  | ||||||
| 	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||||
|  |  | ||||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); |         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,213 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  |     Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||||
|  | #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | //Mixed precision restarted defect correction CG | ||||||
|  | template<class FieldD,class FieldF,  | ||||||
|  |   typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, | ||||||
|  |   typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||||
|  | class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> { | ||||||
|  | public: | ||||||
|  |   using LinearFunction<FieldD>::operator(); | ||||||
|  |   RealD   Tolerance; | ||||||
|  |   RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||||
|  |   Integer MaxInnerIterations; | ||||||
|  |   Integer MaxOuterIterations; | ||||||
|  |   Integer MaxPatchupIterations; | ||||||
|  |   GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||||
|  |   RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||||
|  |   LinearOperatorBase<FieldF> &Linop_f; | ||||||
|  |   LinearOperatorBase<FieldD> &Linop_d; | ||||||
|  |  | ||||||
|  |   //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||||
|  |   LinearFunction<FieldF> *guesser; | ||||||
|  |   bool updateResidual; | ||||||
|  |    | ||||||
|  |   MixedPrecisionConjugateGradientBatched(RealD tol,  | ||||||
|  |           Integer maxinnerit,  | ||||||
|  |           Integer maxouterit,  | ||||||
|  |           Integer maxpatchit, | ||||||
|  |           GridBase* _sp_grid,  | ||||||
|  |           LinearOperatorBase<FieldF> &_Linop_f,  | ||||||
|  |           LinearOperatorBase<FieldD> &_Linop_d, | ||||||
|  |           bool _updateResidual=true) : | ||||||
|  |     Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||||
|  |     Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), | ||||||
|  |     OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; | ||||||
|  |  | ||||||
|  |   void useGuesser(LinearFunction<FieldF> &g){ | ||||||
|  |     guesser = &g; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||||
|  |     std::vector<FieldD> srcs_d_in{src_d_in}; | ||||||
|  |     std::vector<FieldD> sols_d{sol_d}; | ||||||
|  |  | ||||||
|  |     (*this)(srcs_d_in,sols_d); | ||||||
|  |  | ||||||
|  |     sol_d = sols_d[0]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ | ||||||
|  |     assert(src_d_in.size() == sol_d.size()); | ||||||
|  |     int NBatch = src_d_in.size(); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; | ||||||
|  |  | ||||||
|  |     Integer TotalOuterIterations = 0; //Number of restarts | ||||||
|  |     std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations | ||||||
|  |     std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step | ||||||
|  |    | ||||||
|  |     GridStopWatch TotalTimer; | ||||||
|  |     TotalTimer.Start(); | ||||||
|  |  | ||||||
|  |     GridStopWatch InnerCGtimer; | ||||||
|  |     GridStopWatch PrecChangeTimer; | ||||||
|  |      | ||||||
|  |     int cb = src_d_in[0].Checkerboard(); | ||||||
|  |      | ||||||
|  |     std::vector<RealD> src_norm; | ||||||
|  |     std::vector<RealD> norm; | ||||||
|  |     std::vector<RealD> stop; | ||||||
|  |      | ||||||
|  |     GridBase* DoublePrecGrid = src_d_in[0].Grid(); | ||||||
|  |     FieldD tmp_d(DoublePrecGrid); | ||||||
|  |     tmp_d.Checkerboard() = cb; | ||||||
|  |      | ||||||
|  |     FieldD tmp2_d(DoublePrecGrid); | ||||||
|  |     tmp2_d.Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |     std::vector<FieldD> src_d; | ||||||
|  |     std::vector<FieldF> src_f; | ||||||
|  |     std::vector<FieldF> sol_f; | ||||||
|  |  | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       sol_d[i].Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |       src_norm.push_back(norm2(src_d_in[i])); | ||||||
|  |       norm.push_back(0.); | ||||||
|  |       stop.push_back(src_norm[i] * Tolerance*Tolerance); | ||||||
|  |  | ||||||
|  |       src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation | ||||||
|  |  | ||||||
|  |       src_f.push_back(SinglePrecGrid); | ||||||
|  |       src_f[i].Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |       sol_f.push_back(SinglePrecGrid); | ||||||
|  |       sol_f[i].Checkerboard() = cb; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     RealD inner_tol = InnerTolerance; | ||||||
|  |      | ||||||
|  |     ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||||
|  |     CG_f.ErrorOnNoConverge = false; | ||||||
|  |      | ||||||
|  |     Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||||
|  |        | ||||||
|  |     for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||||
|  |       std::cout << GridLogMessage << std::endl; | ||||||
|  |       std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; | ||||||
|  |        | ||||||
|  |       bool allConverged = true; | ||||||
|  |        | ||||||
|  |       for (int i=0; i<NBatch; i++) { | ||||||
|  |         //Compute double precision rsd and also new RHS vector. | ||||||
|  |         Linop_d.HermOp(sol_d[i], tmp_d); | ||||||
|  |         norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector | ||||||
|  |          | ||||||
|  |         std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl; | ||||||
|  |  | ||||||
|  |         PrecChangeTimer.Start(); | ||||||
|  |         precisionChange(src_f[i], src_d[i]); | ||||||
|  |         PrecChangeTimer.Stop(); | ||||||
|  |          | ||||||
|  |         sol_f[i] = Zero(); | ||||||
|  |        | ||||||
|  |         if(norm[i] > OuterLoopNormMult * stop[i]) { | ||||||
|  |           allConverged = false; | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |       if (allConverged) break; | ||||||
|  |  | ||||||
|  |       if (updateResidual) { | ||||||
|  |         RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); | ||||||
|  |         RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); | ||||||
|  |         while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||||
|  |         CG_f.Tolerance = inner_tol; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       //Optionally improve inner solver guess (eg using known eigenvectors) | ||||||
|  |       if(guesser != NULL) { | ||||||
|  |         (*guesser)(src_f, sol_f); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       for (int i=0; i<NBatch; i++) { | ||||||
|  |         //Inner CG | ||||||
|  |         InnerCGtimer.Start(); | ||||||
|  |         CG_f(Linop_f, src_f[i], sol_f[i]); | ||||||
|  |         InnerCGtimer.Stop(); | ||||||
|  |         TotalInnerIterations[i] += CG_f.IterationsToComplete; | ||||||
|  |          | ||||||
|  |         //Convert sol back to double and add to double prec solution | ||||||
|  |         PrecChangeTimer.Start(); | ||||||
|  |         precisionChange(tmp_d, sol_f[i]); | ||||||
|  |         PrecChangeTimer.Stop(); | ||||||
|  |          | ||||||
|  |         axpy(sol_d[i], 1.0, tmp_d, sol_d[i]); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     //Final trial CG | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl; | ||||||
|  |      | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations); | ||||||
|  |       CG_d(Linop_d, src_d_in[i], sol_d[i]); | ||||||
|  |       TotalFinalStepIterations[i] += CG_d.IterationsToComplete; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     TotalTimer.Stop(); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl; | ||||||
|  |     } | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||||
|  |      | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  | #endif | ||||||
| @@ -166,16 +166,16 @@ public: | |||||||
|       rsqf[s] =rsq[s]; |       rsqf[s] =rsq[s]; | ||||||
|       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl; |       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl; | ||||||
|       //      ps_d[s] = src_d; |       //      ps_d[s] = src_d; | ||||||
|       precisionChangeFast(ps_f[s],src_d); |       precisionChange(ps_f[s],src_d); | ||||||
|     } |     } | ||||||
|     // r and p for primary |     // r and p for primary | ||||||
|     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys |     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys | ||||||
|     r_d = p_d; |     r_d = p_d; | ||||||
|      |      | ||||||
|     //MdagM+m[0] |     //MdagM+m[0] | ||||||
|     precisionChangeFast(p_f,p_d); |     precisionChange(p_f,p_d); | ||||||
|     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     precisionChangeFast(tmp_d,mmp_f); |     precisionChange(tmp_d,mmp_f); | ||||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     tmp_d = tmp_d - mmp_d; |     tmp_d = tmp_d - mmp_d; | ||||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; |     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||||
| @@ -204,7 +204,7 @@ public: | |||||||
|    |    | ||||||
|     for(int s=0;s<nshift;s++) { |     for(int s=0;s<nshift;s++) { | ||||||
|       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); |       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); | ||||||
|       precisionChangeFast(psi_f[s],psi_d[s]); |       precisionChange(psi_f[s],psi_d[s]); | ||||||
|     } |     } | ||||||
|    |    | ||||||
|     /////////////////////////////////////// |     /////////////////////////////////////// | ||||||
| @@ -225,7 +225,7 @@ public: | |||||||
|       AXPYTimer.Stop(); |       AXPYTimer.Stop(); | ||||||
|  |  | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(r_f, r_d); |       precisionChange(r_f, r_d); | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|  |  | ||||||
|       AXPYTimer.Start(); |       AXPYTimer.Start(); | ||||||
| @@ -243,13 +243,13 @@ public: | |||||||
|  |  | ||||||
|       cp=c; |       cp=c; | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(p_f, p_d); //get back single prec search direction for linop |       precisionChange(p_f, p_d); //get back single prec search direction for linop | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|       MatrixTimer.Start();   |       MatrixTimer.Start();   | ||||||
|       Linop_f.HermOp(p_f,mmp_f); |       Linop_f.HermOp(p_f,mmp_f); | ||||||
|       MatrixTimer.Stop();   |       MatrixTimer.Stop();   | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(mmp_d, mmp_f); // From Float to Double |       precisionChange(mmp_d, mmp_f); // From Float to Double | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|  |  | ||||||
|       d=real(innerProduct(p_d,mmp_d));     |       d=real(innerProduct(p_d,mmp_d));     | ||||||
| @@ -311,7 +311,7 @@ public: | |||||||
| 	SolverTimer.Stop(); | 	SolverTimer.Stop(); | ||||||
|  |  | ||||||
| 	for(int s=0;s<nshift;s++){ | 	for(int s=0;s<nshift;s++){ | ||||||
| 	  precisionChangeFast(psi_d[s],psi_f[s]); | 	  precisionChange(psi_d[s],psi_f[s]); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	 | 	 | ||||||
|   | |||||||
| @@ -211,7 +211,7 @@ public: | |||||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     tmp_d = tmp_d - mmp_d; |     tmp_d = tmp_d - mmp_d; | ||||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; |     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||||
|     //    assert(norm2(tmp_d)< 1.0e-4); |     assert(norm2(tmp_d)< 1.0); | ||||||
|  |  | ||||||
|     axpy(mmp_d,mass[0],p_d,mmp_d); |     axpy(mmp_d,mass[0],p_d,mmp_d); | ||||||
|     RealD rn = norm2(p_d); |     RealD rn = norm2(p_d); | ||||||
|   | |||||||
| @@ -419,14 +419,15 @@ until convergence | |||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       if ( Nconv < Nstop ) |       if ( Nconv < Nstop ) { | ||||||
| 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl; | 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl; | ||||||
|  | 	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl; | ||||||
|  |       } | ||||||
|       eval=eval2; |       eval=eval2; | ||||||
|        |        | ||||||
|       //Keep only converged |       //Keep only converged | ||||||
|       eval.resize(Nconv);// Nstop? |       eval.resize(Nstop);// was Nconv | ||||||
|       evec.resize(Nconv,grid);// Nstop? |       evec.resize(Nstop,grid);// was Nconv | ||||||
|       basisSortInPlace(evec,eval,reverse); |       basisSortInPlace(evec,eval,reverse); | ||||||
|        |        | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
| /*Allocation types, saying which pointer cache should be used*/ | /*Allocation types, saying which pointer cache should be used*/ | ||||||
| #define Cpu      (0) | #define Cpu      (0) | ||||||
| #define CpuSmall (1) | #define CpuHuge  (1) | ||||||
| #define Acc      (2) | #define CpuSmall (2) | ||||||
| #define AccSmall (3) | #define Acc      (3) | ||||||
| #define Shared   (4) | #define AccHuge  (4) | ||||||
| #define SharedSmall (5) | #define AccSmall (5) | ||||||
|  | #define Shared   (6) | ||||||
|  | #define SharedHuge  (7) | ||||||
|  | #define SharedSmall (8) | ||||||
| #undef GRID_MM_VERBOSE  | #undef GRID_MM_VERBOSE  | ||||||
| uint64_t total_shared; | uint64_t total_shared; | ||||||
| uint64_t total_device; | uint64_t total_device; | ||||||
| @@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void) | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
|  | uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||||
|  | uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| // Data tables for recently freed pooiniter caches | // Data tables for recently freed pooiniter caches | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | ||||||
| int MemoryManager::Victim[MemoryManager::NallocType]; | int MemoryManager::Victim[MemoryManager::NallocType]; | ||||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; | int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; | ||||||
| uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| // Actual allocation and deallocation utils | // Actual allocation and deallocation utils | ||||||
| @@ -170,6 +176,16 @@ void MemoryManager::Init(void) | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   str= getenv("GRID_ALLOC_NCACHE_HUGE"); | ||||||
|  |   if ( str ) { | ||||||
|  |     Nc = atoi(str); | ||||||
|  |     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||||
|  |       Ncache[CpuHuge]=Nc; | ||||||
|  |       Ncache[AccHuge]=Nc; | ||||||
|  |       Ncache[SharedHuge]=Nc; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   str= getenv("GRID_ALLOC_NCACHE_SMALL"); |   str= getenv("GRID_ALLOC_NCACHE_SMALL"); | ||||||
|   if ( str ) { |   if ( str ) { | ||||||
|     Nc = atoi(str); |     Nc = atoi(str); | ||||||
| @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { | |||||||
|    |    | ||||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; |   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl; |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl; | ||||||
| #endif | #endif | ||||||
|    |    | ||||||
| #ifdef GRID_UVM | #ifdef GRID_UVM | ||||||
| @@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) { | |||||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | ||||||
| { | { | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); |   int cache; | ||||||
|   int cache = type + small; |   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||||
|  |   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||||
|  |   else                                     cache = type; | ||||||
|  |  | ||||||
|   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   |   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   | ||||||
| #else | #else | ||||||
|   return ptr; |   return ptr; | ||||||
| @@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) | |||||||
|  |  | ||||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | ||||||
| { | { | ||||||
|   assert(ncache>0); |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
|  |   if (ncache == 0) return ptr; | ||||||
|  |  | ||||||
|   void * ret = NULL; |   void * ret = NULL; | ||||||
|   int v = -1; |   int v = -1; | ||||||
|  |  | ||||||
| @@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | |||||||
| void *MemoryManager::Lookup(size_t bytes,int type) | void *MemoryManager::Lookup(size_t bytes,int type) | ||||||
| { | { | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); |   int cache; | ||||||
|   int cache = type+small; |   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||||
|  |   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||||
|  |   else                                     cache = type; | ||||||
|  |  | ||||||
|   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); |   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); | ||||||
| #else | #else | ||||||
|   return NULL; |   return NULL; | ||||||
| @@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) | |||||||
|  |  | ||||||
| void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | ||||||
| { | { | ||||||
|   assert(ncache>0); |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
| #endif  | #endif  | ||||||
|   | |||||||
| @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // Move control to configure.ac and Config.h? | // Move control to configure.ac and Config.h? | ||||||
|  |  | ||||||
| #define GRID_ALLOC_SMALL_LIMIT (4096) | #define GRID_ALLOC_SMALL_LIMIT (4096) | ||||||
|  | #define GRID_ALLOC_HUGE_LIMIT  (2147483648) | ||||||
|  |  | ||||||
| #define STRINGIFY(x) #x | #define STRINGIFY(x) #x | ||||||
| #define TOSTRING(x) STRINGIFY(x) | #define TOSTRING(x) STRINGIFY(x) | ||||||
| @@ -70,6 +71,21 @@ enum ViewMode { | |||||||
|   CpuWriteDiscard = 0x10 // same for now |   CpuWriteDiscard = 0x10 // same for now | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | struct MemoryStatus { | ||||||
|  |   uint64_t     DeviceBytes; | ||||||
|  |   uint64_t     DeviceLRUBytes; | ||||||
|  |   uint64_t     DeviceMaxBytes; | ||||||
|  |   uint64_t     HostToDeviceBytes; | ||||||
|  |   uint64_t     DeviceToHostBytes; | ||||||
|  |   uint64_t     HostToDeviceXfer; | ||||||
|  |   uint64_t     DeviceToHostXfer; | ||||||
|  |   uint64_t     DeviceEvictions; | ||||||
|  |   uint64_t     DeviceDestroy; | ||||||
|  |   uint64_t     DeviceAllocCacheBytes; | ||||||
|  |   uint64_t     HostAllocCacheBytes; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| class MemoryManager { | class MemoryManager { | ||||||
| private: | private: | ||||||
|  |  | ||||||
| @@ -83,7 +99,7 @@ private: | |||||||
|   } AllocationCacheEntry; |   } AllocationCacheEntry; | ||||||
|  |  | ||||||
|   static const int NallocCacheMax=128;  |   static const int NallocCacheMax=128;  | ||||||
|   static const int NallocType=6; |   static const int NallocType=9; | ||||||
|   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; |   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; | ||||||
|   static int Victim[NallocType]; |   static int Victim[NallocType]; | ||||||
|   static int Ncache[NallocType]; |   static int Ncache[NallocType]; | ||||||
| @@ -121,7 +137,26 @@ private: | |||||||
|   static uint64_t     DeviceToHostXfer; |   static uint64_t     DeviceToHostXfer; | ||||||
|   static uint64_t     DeviceEvictions; |   static uint64_t     DeviceEvictions; | ||||||
|   static uint64_t     DeviceDestroy; |   static uint64_t     DeviceDestroy; | ||||||
|   |    | ||||||
|  |   static uint64_t     DeviceCacheBytes(); | ||||||
|  |   static uint64_t     HostCacheBytes(); | ||||||
|  |  | ||||||
|  |   static MemoryStatus GetFootprint(void) { | ||||||
|  |     MemoryStatus stat; | ||||||
|  |     stat.DeviceBytes       = DeviceBytes; | ||||||
|  |     stat.DeviceLRUBytes    = DeviceLRUBytes; | ||||||
|  |     stat.DeviceMaxBytes    = DeviceMaxBytes; | ||||||
|  |     stat.HostToDeviceBytes = HostToDeviceBytes; | ||||||
|  |     stat.DeviceToHostBytes = DeviceToHostBytes; | ||||||
|  |     stat.HostToDeviceXfer  = HostToDeviceXfer; | ||||||
|  |     stat.DeviceToHostXfer  = DeviceToHostXfer; | ||||||
|  |     stat.DeviceEvictions   = DeviceEvictions; | ||||||
|  |     stat.DeviceDestroy     = DeviceDestroy; | ||||||
|  |     stat.DeviceAllocCacheBytes = DeviceCacheBytes(); | ||||||
|  |     stat.HostAllocCacheBytes   = HostCacheBytes(); | ||||||
|  |     return stat; | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  private: |  private: | ||||||
| #ifndef GRID_UVM | #ifndef GRID_UVM | ||||||
|   ////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s) | |||||||
|   uint64_t LruBytes1=0; |   uint64_t LruBytes1=0; | ||||||
|   uint64_t LruBytes2=0; |   uint64_t LruBytes2=0; | ||||||
|   uint64_t LruCnt=0; |   uint64_t LruCnt=0; | ||||||
|   uint64_t LockedBytes=0; |  | ||||||
|    |    | ||||||
|   std::cout << " Memory Manager::Audit() from "<<s<<std::endl; |   std::cout << " Memory Manager::Audit() from "<<s<<std::endl; | ||||||
|   for(auto it=LRU.begin();it!=LRU.end();it++){ |   for(auto it=LRU.begin();it!=LRU.end();it++){ | ||||||
|   | |||||||
| @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||||
| { | { | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
|   StencilBarrier();// Synch shared memory on a single nodes |  | ||||||
|  |  | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   if (nreq==0) return; | ||||||
|   | |||||||
| @@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
| 							 int recv_from_rank,int dor, | 							 int recv_from_rank,int dor, | ||||||
| 							 int xbytes,int rbytes, int dir) | 							 int xbytes,int rbytes, int dir) | ||||||
| { | { | ||||||
|   return 2.0*bytes; |   return xbytes+rbytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void) | |||||||
|   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; |   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; | ||||||
|   return ShmCommBufs[ShmRank]; |   return ShmCommBufs[ShmRank]; | ||||||
| } | } | ||||||
|  | static inline int divides(int a,int b) | ||||||
|  | { | ||||||
|  |   return ( b == ( (b/a)*a ) ); | ||||||
|  | } | ||||||
|  | void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) | ||||||
|  | { | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Allow user to configure through environment variable | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); | ||||||
|  |   if ( str ) { | ||||||
|  |     std::vector<int> IntShmDims; | ||||||
|  |     GridCmdOptionIntVector(std::string(str),IntShmDims); | ||||||
|  |     assert(IntShmDims.size() == WorldDims.size()); | ||||||
|  |     long ShmSize = 1; | ||||||
|  |     for (int dim=0;dim<WorldDims.size();dim++) { | ||||||
|  |       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); | ||||||
|  |       assert(divides(ShmDims[dim],WorldDims[dim])); | ||||||
|  |     } | ||||||
|  |     assert(ShmSize == WorldShmSize); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Powers of 2,3,5 only in prime decomposition for now | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int ndimension = WorldDims.size(); | ||||||
|  |   ShmDims=Coordinate(ndimension,1); | ||||||
|  |  | ||||||
|  |   std::vector<int> primes({2,3,5}); | ||||||
|  |  | ||||||
|  |   int dim = 0; | ||||||
|  |   int last_dim = ndimension - 1; | ||||||
|  |   int AutoShmSize = 1; | ||||||
|  |   while(AutoShmSize != WorldShmSize) { | ||||||
|  |     int p; | ||||||
|  |     for(p=0;p<primes.size();p++) { | ||||||
|  |       int prime=primes[p]; | ||||||
|  |       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) | ||||||
|  |         && divides(prime,WorldShmSize/AutoShmSize)  ) { | ||||||
|  |   AutoShmSize*=prime; | ||||||
|  |   ShmDims[dim]*=prime; | ||||||
|  |   last_dim = dim; | ||||||
|  |   break; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (p == primes.size() && last_dim == dim) { | ||||||
|  |       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |     dim=(dim+1) %ndimension; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid);  | NAMESPACE_END(Grid);  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
|  | #define Mheader "SharedMemoryMpi: " | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
| #include <pwd.h> | #include <pwd.h> | ||||||
| #include <syscall.h> |  | ||||||
|  |  | ||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
| #include <cuda_runtime_api.h> | #include <cuda_runtime_api.h> | ||||||
| @@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #ifdef GRID_HIP | #ifdef GRID_HIP | ||||||
| #include <hip/hip_runtime_api.h> | #include <hip/hip_runtime_api.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCl | #ifdef GRID_SYCL | ||||||
|  | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|  | #include <syscall.h> | ||||||
|  | #define SHM_SOCKETS  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #include <sys/socket.h> | ||||||
|  | #include <sys/un.h> | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid);  | NAMESPACE_BEGIN(Grid);  | ||||||
| #define header "SharedMemoryMpi: " |  | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Barbaric extra intranode communication route in case we need sockets to pass FDs | ||||||
|  |  * Forced by level_zero not being nicely designed | ||||||
|  |  */ | ||||||
|  | static int sock; | ||||||
|  | static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; | ||||||
|  | static char sock_path[256]; | ||||||
|  | class UnixSockets { | ||||||
|  | public: | ||||||
|  |   static void Open(int rank) | ||||||
|  |   { | ||||||
|  |     int errnum; | ||||||
|  |  | ||||||
|  |     sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0); | ||||||
|  |  | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); | ||||||
|  |     unlink(sa_un.sun_path); | ||||||
|  |     if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { | ||||||
|  |       perror("bind failure"); | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static int RecvFileDescriptor(void) | ||||||
|  |   { | ||||||
|  |     int n; | ||||||
|  |     int fd; | ||||||
|  |     char buf[1]; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct cmsghdr *cmsg; | ||||||
|  |     char cms[CMSG_SPACE(sizeof(int))]; | ||||||
|  |  | ||||||
|  |     iov.iov_base = buf; | ||||||
|  |     iov.iov_len = 1; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof msg); | ||||||
|  |     msg.msg_name = 0; | ||||||
|  |     msg.msg_namelen = 0; | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |  | ||||||
|  |     msg.msg_control = (caddr_t)cms; | ||||||
|  |     msg.msg_controllen = sizeof cms; | ||||||
|  |  | ||||||
|  |     if((n=recvmsg(sock, &msg, 0)) < 0) { | ||||||
|  |       perror("recvmsg failed"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     if(n == 0){ | ||||||
|  |       perror("recvmsg returned 0"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |  | ||||||
|  |     memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); | ||||||
|  |  | ||||||
|  |     return fd; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void SendFileDescriptor(int fildes,int xmit_to_rank) | ||||||
|  |   { | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct cmsghdr *cmsg = NULL; | ||||||
|  |     char ctrl[CMSG_SPACE(sizeof(int))]; | ||||||
|  |     char data = ' '; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof(struct msghdr)); | ||||||
|  |     memset(ctrl, 0, CMSG_SPACE(sizeof(int))); | ||||||
|  |     iov.iov_base = &data; | ||||||
|  |     iov.iov_len = sizeof(data); | ||||||
|  |      | ||||||
|  |     sprintf(sock_path,sock_path_fmt,xmit_to_rank); | ||||||
|  |      | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); | ||||||
|  |  | ||||||
|  |     msg.msg_name = (void *)&sa_un; | ||||||
|  |     msg.msg_namelen = sizeof(sa_un); | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |     msg.msg_controllen =  CMSG_SPACE(sizeof(int)); | ||||||
|  |     msg.msg_control = ctrl; | ||||||
|  |  | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |     cmsg->cmsg_level = SOL_SOCKET; | ||||||
|  |     cmsg->cmsg_type = SCM_RIGHTS; | ||||||
|  |     cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | ||||||
|  |  | ||||||
|  |     *((int *) CMSG_DATA(cmsg)) = fildes; | ||||||
|  |  | ||||||
|  |     sendmsg(sock, &msg, 0); | ||||||
|  |   }; | ||||||
|  | }; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| /*Construct from an MPI communicator*/ | /*Construct from an MPI communicator*/ | ||||||
| void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||||
| { | { | ||||||
| @@ -65,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | |||||||
|   MPI_Comm_size(WorldShmComm     ,&WorldShmSize); |   MPI_Comm_size(WorldShmComm     ,&WorldShmSize); | ||||||
|  |  | ||||||
|   if ( WorldRank == 0) { |   if ( WorldRank == 0) { | ||||||
|     std::cout << header " World communicator of size " <<WorldSize << std::endl;   |     std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;   | ||||||
|     std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl; |     std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl; | ||||||
|   } |   } | ||||||
|   // WorldShmComm, WorldShmSize, WorldShmRank |   // WorldShmComm, WorldShmSize, WorldShmRank | ||||||
|  |  | ||||||
| @@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M | |||||||
|   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); |   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); | ||||||
|   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); |   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); | ||||||
| } | } | ||||||
| static inline int divides(int a,int b) |  | ||||||
| { |  | ||||||
|   return ( b == ( (b/a)*a ) ); |  | ||||||
| } |  | ||||||
| void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) |  | ||||||
| { |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Allow user to configure through environment variable |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); |  | ||||||
|   if ( str ) { |  | ||||||
|     std::vector<int> IntShmDims; |  | ||||||
|     GridCmdOptionIntVector(std::string(str),IntShmDims); |  | ||||||
|     assert(IntShmDims.size() == WorldDims.size()); |  | ||||||
|     long ShmSize = 1; |  | ||||||
|     for (int dim=0;dim<WorldDims.size();dim++) { |  | ||||||
|       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); |  | ||||||
|       assert(divides(ShmDims[dim],WorldDims[dim])); |  | ||||||
|     } |  | ||||||
|     assert(ShmSize == WorldShmSize); |  | ||||||
|     return; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Powers of 2,3,5 only in prime decomposition for now |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   int ndimension = WorldDims.size(); |  | ||||||
|   ShmDims=Coordinate(ndimension,1); |  | ||||||
|  |  | ||||||
|   std::vector<int> primes({2,3,5}); |  | ||||||
|  |  | ||||||
|   int dim = 0; |  | ||||||
|   int last_dim = ndimension - 1; |  | ||||||
|   int AutoShmSize = 1; |  | ||||||
|   while(AutoShmSize != WorldShmSize) { |  | ||||||
|     int p; |  | ||||||
|     for(p=0;p<primes.size();p++) { |  | ||||||
|       int prime=primes[p]; |  | ||||||
|       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) |  | ||||||
|         && divides(prime,WorldShmSize/AutoShmSize)  ) { |  | ||||||
| 	AutoShmSize*=prime; |  | ||||||
| 	ShmDims[dim]*=prime; |  | ||||||
| 	last_dim = dim; |  | ||||||
| 	break; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if (p == primes.size() && last_dim == dim) { |  | ||||||
|       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; |  | ||||||
|       exit(EXIT_FAILURE); |  | ||||||
|     } |  | ||||||
|     dim=(dim+1) %ndimension; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) | void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) | ||||||
| { | { | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -395,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce | |||||||
| #ifdef GRID_MPI3_SHMGET | #ifdef GRID_MPI3_SHMGET | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; |   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0); |   assert(_ShmAlloc==0); | ||||||
|  |  | ||||||
| @@ -480,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     exit(EXIT_FAILURE);   |     exit(EXIT_FAILURE);   | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  |   std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||||
| 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||||
|  |  | ||||||
|   SharedMemoryZero(ShmCommBuf,bytes); |   SharedMemoryZero(ShmCommBuf,bytes); | ||||||
| @@ -523,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     exit(EXIT_FAILURE);   |     exit(EXIT_FAILURE);   | ||||||
|   } |   } | ||||||
|   if ( WorldRank == 0 ){ |   if ( WorldRank == 0 ){ | ||||||
|     std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  |     std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||||
| 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; | 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; | ||||||
|   } |   } | ||||||
|   SharedMemoryZero(ShmCommBuf,bytes); |   SharedMemoryZero(ShmCommBuf,bytes); | ||||||
| @@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Loop over ranks/gpu's on our node |   // Loop over ranks/gpu's on our node | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |   UnixSockets::Open(WorldShmRank); | ||||||
|  | #endif | ||||||
|   for(int r=0;r<WorldShmSize;r++){ |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |  | ||||||
|  |     MPI_Barrier(WorldShmComm); | ||||||
|  |  | ||||||
| #ifndef GRID_MPI3_SHM_NONE | #ifndef GRID_MPI3_SHM_NONE | ||||||
|     ////////////////////////////////////////////////// |     ////////////////////////////////////////////////// | ||||||
|     // If it is me, pass around the IPC access key |     // If it is me, pass around the IPC access key | ||||||
| @@ -540,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     void * thisBuf = ShmCommBuf; |     void * thisBuf = ShmCommBuf; | ||||||
|     if(!Stencil_force_mpi) { |     if(!Stencil_force_mpi) { | ||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|     typedef struct { int fd; pid_t pid ; } clone_mem_t; |     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; | ||||||
|  |  | ||||||
|     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device()); |     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); | ||||||
|     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context()); |     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); | ||||||
|        |        | ||||||
|     ze_ipc_mem_handle_t ihandle; |     ze_ipc_mem_handle_t ihandle; | ||||||
|     clone_mem_t handle; |     clone_mem_t handle; | ||||||
|  |      | ||||||
|     if ( r==WorldShmRank ) {  |     if ( r==WorldShmRank ) {  | ||||||
|       auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); |       auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); | ||||||
|       if ( err != ZE_RESULT_SUCCESS ) { |       if ( err != ZE_RESULT_SUCCESS ) { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||||
| 	exit(EXIT_FAILURE); | 	exit(EXIT_FAILURE); | ||||||
|       } else { |       } else { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||||
|       } |       } | ||||||
|       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); |       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); | ||||||
|       handle.pid = getpid(); |       handle.pid = getpid(); | ||||||
|  |       memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle)); | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |       for(int rr=0;rr<WorldShmSize;rr++){ | ||||||
|  | 	if(rr!=r){ | ||||||
|  | 	  UnixSockets::SendFileDescriptor(handle.fd,rr); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
| @@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     // Share this IPC handle across the Shm Comm |     // Share this IPC handle across the Shm Comm | ||||||
|     ////////////////////////////////////////////////// |     ////////////////////////////////////////////////// | ||||||
|     {  |     {  | ||||||
|  |       MPI_Barrier(WorldShmComm); | ||||||
|       int ierr=MPI_Bcast(&handle, |       int ierr=MPI_Bcast(&handle, | ||||||
| 			 sizeof(handle), | 			 sizeof(handle), | ||||||
| 			 MPI_BYTE, | 			 MPI_BYTE, | ||||||
| @@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|     if ( r!=WorldShmRank ) { |     if ( r!=WorldShmRank ) { | ||||||
|       thisBuf = nullptr; |       thisBuf = nullptr; | ||||||
|  |       int myfd; | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |       myfd=UnixSockets::RecvFileDescriptor(); | ||||||
|  | #else | ||||||
|       std::cout<<"mapping seeking remote pid/fd " |       std::cout<<"mapping seeking remote pid/fd " | ||||||
| 	       <<handle.pid<<"/" | 	       <<handle.pid<<"/" | ||||||
| 	       <<handle.fd<<std::endl; | 	       <<handle.fd<<std::endl; | ||||||
| @@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|       int pidfd = syscall(SYS_pidfd_open,handle.pid,0); |       int pidfd = syscall(SYS_pidfd_open,handle.pid,0); | ||||||
|       std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; |       std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; | ||||||
|       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); |       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); | ||||||
|       int myfd  = syscall(438,pidfd,handle.fd,0); |       myfd  = syscall(438,pidfd,handle.fd,0); | ||||||
|  |       int err_t = errno; | ||||||
|       std::cout<<"Using IpcHandle myfd "<<myfd<<"\n"; |       if (myfd < 0) { | ||||||
|        |         fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr); | ||||||
|  | 	perror("pidfd_getfd failed "); | ||||||
|  | 	assert(0); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; | ||||||
|  |       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); | ||||||
|       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); |       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); | ||||||
|  |  | ||||||
|       auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf); |       auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf); | ||||||
|       if ( err != ZE_RESULT_SUCCESS ) { |       if ( err != ZE_RESULT_SUCCESS ) { | ||||||
| 	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | ||||||
| 	exit(EXIT_FAILURE); | 	exit(EXIT_FAILURE); | ||||||
|       } else { |       } else { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl; | 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl; | ||||||
| @@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #else | #else | ||||||
|     WorldShmCommBufs[r] = ShmCommBuf; |     WorldShmCommBufs[r] = ShmCommBuf; | ||||||
| #endif | #endif | ||||||
|  |     MPI_Barrier(WorldShmComm); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   _ShmAllocBytes=bytes; |   _ShmAllocBytes=bytes; | ||||||
| @@ -662,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #ifdef GRID_MPI3_SHMMMAP | #ifdef GRID_MPI3_SHMMMAP | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; |   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0); |   assert(_ShmAlloc==0); | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -699,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     assert(((uint64_t)ptr&0x3F)==0); |     assert(((uint64_t)ptr&0x3F)==0); | ||||||
|     close(fd); |     close(fd); | ||||||
|     WorldShmCommBufs[r] =ptr; |     WorldShmCommBufs[r] =ptr; | ||||||
|     //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; |     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||||
|   } |   } | ||||||
|   _ShmAlloc=1; |   _ShmAlloc=1; | ||||||
|   _ShmAllocBytes  = bytes; |   _ShmAllocBytes  = bytes; | ||||||
| @@ -709,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #ifdef GRID_MPI3_SHM_NONE | #ifdef GRID_MPI3_SHM_NONE | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; |   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0); |   assert(_ShmAlloc==0); | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -756,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| {  | {  | ||||||
|   std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; |   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0);  |   assert(_ShmAlloc==0);  | ||||||
|   MPI_Barrier(WorldShmComm); |   MPI_Barrier(WorldShmComm); | ||||||
|   | |||||||
| @@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| extern Vector<std::pair<int,int> > Cshift_table;  | extern std::vector<std::pair<int,int> > Cshift_table;  | ||||||
|  | extern commVector<std::pair<int,int> > Cshift_table_device;  | ||||||
|  |  | ||||||
|  | inline std::pair<int,int> *MapCshiftTable(void) | ||||||
|  | { | ||||||
|  |   // GPU version | ||||||
|  | #ifdef ACCELERATOR_CSHIFT     | ||||||
|  |   uint64_t sz=Cshift_table.size(); | ||||||
|  |   if (Cshift_table_device.size()!=sz )    { | ||||||
|  |     Cshift_table_device.resize(sz); | ||||||
|  |   } | ||||||
|  |   acceleratorCopyToDevice((void *)&Cshift_table[0], | ||||||
|  | 			  (void *)&Cshift_table_device[0], | ||||||
|  | 			  sizeof(Cshift_table[0])*sz); | ||||||
|  |  | ||||||
|  |   return &Cshift_table_device[0]; | ||||||
|  | #else  | ||||||
|  |   return &Cshift_table[0]; | ||||||
|  | #endif | ||||||
|  |   // CPU version use identify map | ||||||
|  | } | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| // Gather for when there is no need to SIMD split  | // Gather for when there is no need to SIMD split  | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| @@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim | |||||||
|   } |   } | ||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = &Cshift_table[0]; |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | ||||||
| @@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector< | |||||||
|    |    | ||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = &Cshift_table[0]; |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView( rhs_v, rhs, AcceleratorWrite); |     autoView( rhs_v, rhs, AcceleratorWrite); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| @@ -340,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = &Cshift_table[0]; |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     autoView(lhs_v , lhs, AcceleratorWrite); |     autoView(lhs_v , lhs, AcceleratorWrite); | ||||||
| @@ -392,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = &Cshift_table[0]; |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView( rhs_v, rhs, AcceleratorRead); |     autoView( rhs_v, rhs, AcceleratorRead); | ||||||
|     autoView( lhs_v, lhs, AcceleratorWrite); |     autoView( lhs_v, lhs, AcceleratorWrite); | ||||||
|   | |||||||
| @@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; |   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; | ||||||
|   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); |   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); | ||||||
|  |  | ||||||
|  |   RealD t1,t0; | ||||||
|  |   t0=usecond(); | ||||||
|   if ( !comm_dim ) { |   if ( !comm_dim ) { | ||||||
|     //std::cout << "CSHIFT: Cshift_local" <<std::endl; |     //std::cout << "CSHIFT: Cshift_local" <<std::endl; | ||||||
|     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding |     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding | ||||||
| @@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; |     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms(ret,rhs,dimension,shift); |     Cshift_comms(ret,rhs,dimension,shift); | ||||||
|   } |   } | ||||||
|  |   t1=usecond(); | ||||||
|  |   //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|      |      | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |   int cb= (cbmask==0x2)? Odd : Even; | ||||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); |   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||||
|  |   RealD tcopy=0.0; | ||||||
|  |   RealD tgather=0.0; | ||||||
|  |   RealD tscatter=0.0; | ||||||
|  |   RealD tcomms=0.0; | ||||||
|  |   uint64_t xbytes=0; | ||||||
|   for(int x=0;x<rd;x++){        |   for(int x=0;x<rd;x++){        | ||||||
|  |  | ||||||
|     int sx        =  (x+sshift)%rd; |     int sx        =  (x+sshift)%rd; | ||||||
|     int comm_proc = ((x+sshift)/rd)%pd; |     int comm_proc = ((x+sshift)/rd)%pd; | ||||||
|      |      | ||||||
|     if (comm_proc==0) { |     if (comm_proc==0) { | ||||||
|  |       tcopy-=usecond(); | ||||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||||
|  |       tcopy+=usecond(); | ||||||
|     } else { |     } else { | ||||||
|  |  | ||||||
|       int words = buffer_size; |       int words = buffer_size; | ||||||
| @@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|  |  | ||||||
|       int bytes = words * sizeof(vobj); |       int bytes = words * sizeof(vobj); | ||||||
|  |  | ||||||
|  |       tgather-=usecond(); | ||||||
|       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); |       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); | ||||||
|  |       tgather+=usecond(); | ||||||
|  |  | ||||||
|       //      int rank           = grid->_processor; |       //      int rank           = grid->_processor; | ||||||
|       int recv_from_rank; |       int recv_from_rank; | ||||||
|       int xmit_to_rank; |       int xmit_to_rank; | ||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  |        | ||||||
|       grid->Barrier(); |       tcomms-=usecond(); | ||||||
|  |       //      grid->Barrier(); | ||||||
|  |  | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |       grid->SendToRecvFrom((void *)&send_buf[0], | ||||||
| 			   xmit_to_rank, | 			   xmit_to_rank, | ||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|  |       xbytes+=bytes; | ||||||
|  |       //      grid->Barrier(); | ||||||
|  |       tcomms+=usecond(); | ||||||
|  |  | ||||||
|       grid->Barrier(); |       tscatter-=usecond(); | ||||||
|  |  | ||||||
|       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); |       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); | ||||||
|  |       tscatter+=usecond(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   /* | ||||||
|  |   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||||
|  |   */ | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -190,6 +210,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|  |  | ||||||
|  |   RealD tcopy=0.0; | ||||||
|  |   RealD tgather=0.0; | ||||||
|  |   RealD tscatter=0.0; | ||||||
|  |   RealD tcomms=0.0; | ||||||
|  |   uint64_t xbytes=0; | ||||||
|  |    | ||||||
|   int permute_type=grid->PermuteType(dimension); |   int permute_type=grid->PermuteType(dimension); | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////// |   /////////////////////////////////////////////// | ||||||
| @@ -227,7 +253,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       pointers[i] = &send_buf_extract[i][0]; |       pointers[i] = &send_buf_extract[i][0]; | ||||||
|     } |     } | ||||||
|     int sx   = (x+sshift)%rd; |     int sx   = (x+sshift)%rd; | ||||||
|  |     tgather-=usecond(); | ||||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); |     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||||
|  |     tgather+=usecond(); | ||||||
|  |  | ||||||
|     for(int i=0;i<Nsimd;i++){ |     for(int i=0;i<Nsimd;i++){ | ||||||
|        |        | ||||||
| @@ -252,7 +280,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       if(nbr_proc){ |       if(nbr_proc){ | ||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
| 	grid->Barrier(); | 	tcomms-=usecond(); | ||||||
|  | 	//	grid->Barrier(); | ||||||
|  |  | ||||||
| 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | ||||||
| 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | ||||||
| @@ -262,7 +291,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
|  |  | ||||||
| 	grid->Barrier(); | 	xbytes+=bytes; | ||||||
|  | 	//	grid->Barrier(); | ||||||
|  | 	tcomms+=usecond(); | ||||||
|  |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|       } else {  |       } else {  | ||||||
| @@ -270,9 +301,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       } |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |     tscatter-=usecond(); | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||||
|  |     tscatter+=usecond(); | ||||||
|   } |   } | ||||||
|  |   /* | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||||
|  |   */ | ||||||
| } | } | ||||||
| #else  | #else  | ||||||
| template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|   assert(comm_dim==1); |   assert(comm_dim==1); | ||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|  |   RealD tcopy=0.0; | ||||||
|  |   RealD tgather=0.0; | ||||||
|  |   RealD tscatter=0.0; | ||||||
|  |   RealD tcomms=0.0; | ||||||
|  |   uint64_t xbytes=0; | ||||||
|    |    | ||||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; |   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; | ||||||
|   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); |   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); | ||||||
| @@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|      |      | ||||||
|     if (comm_proc==0) { |     if (comm_proc==0) { | ||||||
|  |  | ||||||
|  |       tcopy-=usecond(); | ||||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||||
|  |       tcopy+=usecond(); | ||||||
|  |  | ||||||
|     } else { |     } else { | ||||||
|  |  | ||||||
| @@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|  |  | ||||||
|       int bytes = words * sizeof(vobj); |       int bytes = words * sizeof(vobj); | ||||||
|  |  | ||||||
|  |       tgather-=usecond(); | ||||||
|       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); |       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); | ||||||
|  |       tgather+=usecond(); | ||||||
|  |  | ||||||
|       //      int rank           = grid->_processor; |       //      int rank           = grid->_processor; | ||||||
|       int recv_from_rank; |       int recv_from_rank; | ||||||
| @@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  |  | ||||||
|  |  | ||||||
|       grid->Barrier(); |       tcomms-=usecond(); | ||||||
|  |       //      grid->Barrier(); | ||||||
|  |  | ||||||
|       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); |       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |       grid->SendToRecvFrom((void *)&send_buf[0], | ||||||
| @@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|  |       xbytes+=bytes; | ||||||
|       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); |       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); | ||||||
|  |  | ||||||
|       grid->Barrier(); |       //      grid->Barrier(); | ||||||
|  |       tcomms+=usecond(); | ||||||
|  |  | ||||||
|  |       tscatter-=usecond(); | ||||||
|       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); |       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); | ||||||
|  |       tscatter+=usecond(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   /* | ||||||
|  |   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||||
|  |   */ | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -372,6 +432,11 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   assert(simd_layout==2); |   assert(simd_layout==2); | ||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|  |   RealD tcopy=0.0; | ||||||
|  |   RealD tgather=0.0; | ||||||
|  |   RealD tscatter=0.0; | ||||||
|  |   RealD tcomms=0.0; | ||||||
|  |   uint64_t xbytes=0; | ||||||
|  |  | ||||||
|   int permute_type=grid->PermuteType(dimension); |   int permute_type=grid->PermuteType(dimension); | ||||||
|  |  | ||||||
| @@ -414,8 +479,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|     for(int i=0;i<Nsimd;i++){        |     for(int i=0;i<Nsimd;i++){        | ||||||
|       pointers[i] = &send_buf_extract[i][0]; |       pointers[i] = &send_buf_extract[i][0]; | ||||||
|     } |     } | ||||||
|  |     tgather-=usecond(); | ||||||
|     int sx   = (x+sshift)%rd; |     int sx   = (x+sshift)%rd; | ||||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); |     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||||
|  |     tgather+=usecond(); | ||||||
|  |  | ||||||
|     for(int i=0;i<Nsimd;i++){ |     for(int i=0;i<Nsimd;i++){ | ||||||
|        |        | ||||||
| @@ -440,7 +507,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       if(nbr_proc){ |       if(nbr_proc){ | ||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
| 	grid->Barrier(); | 	tcomms-=usecond(); | ||||||
|  | 	//	grid->Barrier(); | ||||||
|  |  | ||||||
| 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); | 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); | ||||||
| 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | ||||||
| @@ -449,17 +517,28 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
| 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | ||||||
|  | 	xbytes+=bytes; | ||||||
|  |  | ||||||
| 	grid->Barrier(); | 	//	grid->Barrier(); | ||||||
|  | 	tcomms+=usecond(); | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|       } else {  |       } else {  | ||||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |     tscatter-=usecond(); | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||||
|   } |     tscatter+=usecond(); | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |   /* | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|  |   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; | ||||||
|  |   */ | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| NAMESPACE_END(Grid);  | NAMESPACE_END(Grid);  | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| #include <Grid/GridCore.h>        | #include <Grid/GridCore.h>        | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| Vector<std::pair<int,int> > Cshift_table;  | std::vector<std::pair<int,int> > Cshift_table;  | ||||||
|  | commVector<std::pair<int,int> > Cshift_table_device;  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/lattice/Lattice_transfer.h> | #include <Grid/lattice/Lattice_transfer.h> | ||||||
| #include <Grid/lattice/Lattice_basis.h> | #include <Grid/lattice/Lattice_basis.h> | ||||||
| #include <Grid/lattice/Lattice_crc.h> | #include <Grid/lattice/Lattice_crc.h> | ||||||
|  | #include <Grid/lattice/PaddedCell.h> | ||||||
|   | |||||||
| @@ -345,7 +345,9 @@ GridUnopClass(UnaryNot, Not(a)); | |||||||
| GridUnopClass(UnaryTrace, trace(a)); | GridUnopClass(UnaryTrace, trace(a)); | ||||||
| GridUnopClass(UnaryTranspose, transpose(a)); | GridUnopClass(UnaryTranspose, transpose(a)); | ||||||
| GridUnopClass(UnaryTa, Ta(a)); | GridUnopClass(UnaryTa, Ta(a)); | ||||||
|  | GridUnopClass(UnarySpTa, SpTa(a)); | ||||||
| GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); | GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); | ||||||
|  | GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a)); | ||||||
| GridUnopClass(UnaryTimesI, timesI(a)); | GridUnopClass(UnaryTimesI, timesI(a)); | ||||||
| GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); | GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); | ||||||
| GridUnopClass(UnaryAbs, abs(a)); | GridUnopClass(UnaryAbs, abs(a)); | ||||||
| @@ -456,7 +458,9 @@ GRID_DEF_UNOP(operator!, UnaryNot); | |||||||
| GRID_DEF_UNOP(trace, UnaryTrace); | GRID_DEF_UNOP(trace, UnaryTrace); | ||||||
| GRID_DEF_UNOP(transpose, UnaryTranspose); | GRID_DEF_UNOP(transpose, UnaryTranspose); | ||||||
| GRID_DEF_UNOP(Ta, UnaryTa); | GRID_DEF_UNOP(Ta, UnaryTa); | ||||||
|  | GRID_DEF_UNOP(SpTa, UnarySpTa); | ||||||
| GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); | GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); | ||||||
|  | GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup); | ||||||
| GRID_DEF_UNOP(timesI, UnaryTimesI); | GRID_DEF_UNOP(timesI, UnaryTimesI); | ||||||
| GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); | GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); | ||||||
| GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the | GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the | ||||||
|   | |||||||
| @@ -270,5 +270,42 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L | |||||||
|     return axpby_norm_fast(ret,a,b,x,y); |     return axpby_norm_fast(ret,a,b,x,y); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Trace product | ||||||
|  | template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2) | ||||||
|  |   -> Lattice<decltype(trace(obj()))> | ||||||
|  | { | ||||||
|  |   typedef decltype(trace(obj())) robj; | ||||||
|  |   Lattice<robj> ret_i(rhs_1.Grid()); | ||||||
|  |   autoView( rhs1 , rhs_1, AcceleratorRead); | ||||||
|  |   autoView( rhs2 , rhs_2, AcceleratorRead); | ||||||
|  |   autoView( ret , ret_i, AcceleratorWrite); | ||||||
|  |   ret.Checkerboard() = rhs_1.Checkerboard(); | ||||||
|  |   accelerator_for(ss,rhs1.size(),obj::Nsimd(),{ | ||||||
|  |       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss))); | ||||||
|  |   }); | ||||||
|  |   return ret_i; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2) | ||||||
|  |   -> Lattice<decltype(trace(obj1()))> | ||||||
|  | { | ||||||
|  |   typedef decltype(trace(obj1())) robj; | ||||||
|  |   Lattice<robj> ret_i(rhs_1.Grid()); | ||||||
|  |   autoView( rhs1 , rhs_1, AcceleratorRead); | ||||||
|  |   autoView( ret , ret_i, AcceleratorWrite); | ||||||
|  |   ret.Checkerboard() = rhs_1.Checkerboard(); | ||||||
|  |   accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{ | ||||||
|  |       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); | ||||||
|  |   }); | ||||||
|  |   return ret_i; | ||||||
|  | } | ||||||
|  | template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1) | ||||||
|  |   -> Lattice<decltype(trace(obj1()))> | ||||||
|  | { | ||||||
|  |   return traceProduct(rhs_1,rhs_2); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg) | ||||||
| { | { | ||||||
|   Integer osites = arg.Grid()->oSites(); |   Integer osites = arg.Grid()->oSites(); | ||||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||||
|   typename vobj::scalar_object ssum; |  | ||||||
|   autoView( arg_v, arg, AcceleratorRead); |   autoView( arg_v, arg, AcceleratorRead); | ||||||
|   ssum= sum_gpu(&arg_v[0],osites); |   return sum_gpu(&arg_v[0],osites); | ||||||
| #else | #else | ||||||
|   autoView(arg_v, arg, CpuRead); |   autoView(arg_v, arg, CpuRead); | ||||||
|   auto ssum= sum_cpu(&arg_v[0],osites); |   return sum_cpu(&arg_v[0],osites); | ||||||
| #endif   | #endif   | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||||
|  | { | ||||||
|  |   auto ssum = rankSum(arg); | ||||||
|   arg.Grid()->GlobalSum(ssum); |   arg.Grid()->GlobalSum(ssum); | ||||||
|   return ssum; |   return ssum; | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg) | ||||||
| { | { | ||||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||||
|   autoView( arg_v, arg, AcceleratorRead); |   autoView( arg_v, arg, AcceleratorRead); | ||||||
|   Integer osites = arg.Grid()->oSites(); |   Integer osites = arg.Grid()->oSites(); | ||||||
|   auto ssum= sum_gpu_large(&arg_v[0],osites); |   return sum_gpu_large(&arg_v[0],osites); | ||||||
| #else | #else | ||||||
|   autoView(arg_v, arg, CpuRead); |   autoView(arg_v, arg, CpuRead); | ||||||
|   Integer osites = arg.Grid()->oSites(); |   Integer osites = arg.Grid()->oSites(); | ||||||
|   auto ssum= sum_cpu(&arg_v[0],osites); |   return sum_cpu(&arg_v[0],osites); | ||||||
| #endif | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||||
|  | { | ||||||
|  |   auto ssum = rankSumLarge(arg); | ||||||
|   arg.Grid()->GlobalSum(ssum); |   arg.Grid()->GlobalSum(ssum); | ||||||
|   return ssum; |   return ssum; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & | |||||||
|   cudaGetDevice(&device); |   cudaGetDevice(&device); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_HIP | #ifdef GRID_HIP | ||||||
|   hipGetDevice(&device); |   auto r=hipGetDevice(&device); | ||||||
| #endif | #endif | ||||||
|    |    | ||||||
|   Iterator warpSize            = gpu_props[device].warpSize; |   Iterator warpSize            = gpu_props[device].warpSize; | ||||||
| @@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | |||||||
|   assert(ok); |   assert(ok); | ||||||
|  |  | ||||||
|   Integer smemSize = numThreads * sizeof(sobj); |   Integer smemSize = numThreads * sizeof(sobj); | ||||||
|   // UVM seems to be buggy under later CUDA drivers |   // Move out of UVM | ||||||
|   // This fails on A100 and driver 5.30.02 / CUDA 12.1 |   // Turns out I had messed up the synchronise after move to compute stream | ||||||
|   // Fails with multiple NVCC versions back to 11.4, |   // as running this on the default stream fools the synchronise | ||||||
|   // which worked with earlier drivers. |  | ||||||
|   // Not sure which driver had first fail and this bears checking |  | ||||||
|   // Is awkward as must install multiple driver versions |  | ||||||
| #undef UVM_BLOCK_BUFFER   | #undef UVM_BLOCK_BUFFER   | ||||||
| #ifndef UVM_BLOCK_BUFFER   | #ifndef UVM_BLOCK_BUFFER   | ||||||
|   commVector<sobj> buffer(numBlocks); |   commVector<sobj> buffer(numBlocks); | ||||||
|   sobj *buffer_v = &buffer[0]; |   sobj *buffer_v = &buffer[0]; | ||||||
|   sobj result; |   sobj result; | ||||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); |   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||||
|   accelerator_barrier(); |   accelerator_barrier(); | ||||||
|   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); |   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); | ||||||
| #else | #else | ||||||
|   Vector<sobj> buffer(numBlocks); |   Vector<sobj> buffer(numBlocks); | ||||||
|   sobj *buffer_v = &buffer[0]; |   sobj *buffer_v = &buffer[0]; | ||||||
|   sobj result; |   sobj result; | ||||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); |   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||||
|   accelerator_barrier(); |   accelerator_barrier(); | ||||||
|   result = *buffer_v; |   result = *buffer_v; | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -440,17 +440,8 @@ public: | |||||||
| 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx); | 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx); | ||||||
|  |  | ||||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||||
| #if 1 |  | ||||||
| 	assert(rank == _grid->ThisRank() ); |  | ||||||
| #else |  | ||||||
| //  |  | ||||||
| 	if (rank != _grid->ThisRank() ){ |  | ||||||
| 	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl; |  | ||||||
| //	exit(-42); |  | ||||||
| //	assert(0); |  | ||||||
| 	} |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | 	assert(rank == _grid->ThisRank() ); | ||||||
| 	 | 	 | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
| 	_generators[l_idx] = master_engine; | 	_generators[l_idx] = master_engine; | ||||||
|   | |||||||
| @@ -66,6 +66,65 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex< | |||||||
|   return ret; |   return ret; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | template<int N, class Vec> | ||||||
|  | Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu) | ||||||
|  | { | ||||||
|  |   GridBase *grid=Umu.Grid(); | ||||||
|  |   auto lvol = grid->lSites(); | ||||||
|  |   Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid); | ||||||
|  |   typedef typename Vec::scalar_type scalar; | ||||||
|  |   autoView(Umu_v,Umu,CpuRead); | ||||||
|  |   autoView(ret_v,ret,CpuWrite); | ||||||
|  |   thread_for(site,lvol,{ | ||||||
|  |     Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); | ||||||
|  |     Coordinate lcoor; | ||||||
|  |     grid->LocalIndexToLocalCoor(site, lcoor); | ||||||
|  |     iScalar<iScalar<iMatrix<scalar, N> > > Us; | ||||||
|  |     peekLocalSite(Us, Umu_v, lcoor); | ||||||
|  |     for(int i=0;i<N;i++){ | ||||||
|  |       for(int j=0;j<N;j++){ | ||||||
|  | 	scalar tmp= Us()()(i,j); | ||||||
|  | 	ComplexD ztmp(real(tmp),imag(tmp)); | ||||||
|  | 	EigenU(i,j)=ztmp; | ||||||
|  |       }} | ||||||
|  |     ComplexD detD  = EigenU.determinant(); | ||||||
|  |     typename Vec::scalar_type det(detD.real(),detD.imag()); | ||||||
|  |     pokeLocalSite(det,ret_v,lcoor); | ||||||
|  |   }); | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<int N> | ||||||
|  | Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu) | ||||||
|  | { | ||||||
|  |   GridBase *grid=Umu.Grid(); | ||||||
|  |   auto lvol = grid->lSites(); | ||||||
|  |   Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid); | ||||||
|  |    | ||||||
|  |   autoView(Umu_v,Umu,CpuRead); | ||||||
|  |   autoView(ret_v,ret,CpuWrite); | ||||||
|  |   thread_for(site,lvol,{ | ||||||
|  |     Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); | ||||||
|  |     Coordinate lcoor; | ||||||
|  |     grid->LocalIndexToLocalCoor(site, lcoor); | ||||||
|  |     iScalar<iScalar<iMatrix<ComplexD, N> > > Us; | ||||||
|  |     iScalar<iScalar<iMatrix<ComplexD, N> > > Ui; | ||||||
|  |     peekLocalSite(Us, Umu_v, lcoor); | ||||||
|  |     for(int i=0;i<N;i++){ | ||||||
|  |       for(int j=0;j<N;j++){ | ||||||
|  | 	EigenU(i,j) = Us()()(i,j); | ||||||
|  |       }} | ||||||
|  |     Eigen::MatrixXcd EigenUinv = EigenU.inverse(); | ||||||
|  |     for(int i=0;i<N;i++){ | ||||||
|  |       for(int j=0;j<N;j++){ | ||||||
|  | 	Ui()()(i,j) = EigenUinv(i,j); | ||||||
|  |       }} | ||||||
|  |     pokeLocalSite(Ui,ret_v,lcoor); | ||||||
|  |   }); | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData, | |||||||
|     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);  |     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | template<class vobj,class CComplex,int nbasis,class VLattice> | ||||||
|  | inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||||
|  |                                const std::vector<Lattice<vobj>> &fineData, | ||||||
|  |                                const VLattice &Basis) | ||||||
|  | { | ||||||
|  |   int NBatch = fineData.size(); | ||||||
|  |   assert(coarseData.size() == NBatch); | ||||||
|  |  | ||||||
|  |   GridBase * fine  = fineData[0].Grid(); | ||||||
|  |   GridBase * coarse= coarseData[0].Grid(); | ||||||
|  |  | ||||||
|  |   Lattice<iScalar<CComplex>> ip(coarse); | ||||||
|  |   std::vector<Lattice<vobj>> fineDataCopy = fineData; | ||||||
|  |  | ||||||
|  |   autoView(ip_, ip, AcceleratorWrite); | ||||||
|  |   for(int v=0;v<nbasis;v++) { | ||||||
|  |     for (int k=0; k<NBatch; k++) { | ||||||
|  |       autoView( coarseData_ , coarseData[k], AcceleratorWrite); | ||||||
|  |       blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine> | ||||||
|  |       accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { | ||||||
|  |         convertType(coarseData_[sc](v),ip_[sc]); | ||||||
|  |       }); | ||||||
|  |  | ||||||
|  |       // improve numerical stability of projection | ||||||
|  |       // |fine> = |fine> - <basis|fine> |basis> | ||||||
|  |       ip=-ip; | ||||||
|  |       blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);  | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| template<class vobj,class vobj2,class CComplex> | template<class vobj,class vobj2,class CComplex> | ||||||
|   inline void blockZAXPY(Lattice<vobj> &fineZ, |   inline void blockZAXPY(Lattice<vobj> &fineZ, | ||||||
| @@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, | |||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | template<class vobj,class CComplex,int nbasis,class VLattice> | ||||||
|  | inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||||
|  |                                std::vector<Lattice<vobj>> &fineData, | ||||||
|  |                                const VLattice &Basis) | ||||||
|  | { | ||||||
|  |   int NBatch = coarseData.size(); | ||||||
|  |   assert(fineData.size() == NBatch); | ||||||
|  |  | ||||||
|  |   GridBase * fine   = fineData[0].Grid(); | ||||||
|  |   GridBase * coarse = coarseData[0].Grid(); | ||||||
|  |   for (int k=0; k<NBatch; k++) | ||||||
|  |     fineData[k]=Zero(); | ||||||
|  |   for (int i=0;i<nbasis;i++) { | ||||||
|  |     for (int k=0; k<NBatch; k++) { | ||||||
|  |       Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i); | ||||||
|  |       blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. | // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. | ||||||
| // Simd layouts need not match since we use peek/poke Local | // Simd layouts need not match since we use peek/poke Local | ||||||
| template<class vobj,class vvobj> | template<class vobj,class vvobj> | ||||||
| @@ -648,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | |||||||
|   for(int d=0;d<nd;d++){ |   for(int d=0;d<nd;d++){ | ||||||
|     assert(Fg->_processors[d]  == Tg->_processors[d]); |     assert(Fg->_processors[d]  == Tg->_processors[d]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|  |    | ||||||
|  | #if 1 | ||||||
|  |  | ||||||
|  |   size_t nsite = 1; | ||||||
|  |   for(int i=0;i<nd;i++) nsite *= RegionSize[i]; | ||||||
|  |    | ||||||
|  |   size_t tbytes = 4*nsite*sizeof(int); | ||||||
|  |   int *table = (int*)malloc(tbytes); | ||||||
|  |   | ||||||
|  |   thread_for(idx, nsite, { | ||||||
|  |       Coordinate from_coor, to_coor; | ||||||
|  |       size_t rem = idx; | ||||||
|  |       for(int i=0;i<nd;i++){ | ||||||
|  | 	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i]; | ||||||
|  | 	from_coor[i] = base_i + FromLowerLeft[i]; | ||||||
|  | 	to_coor[i] = base_i + ToLowerLeft[i]; | ||||||
|  |       } | ||||||
|  |        | ||||||
|  |       int foidx = Fg->oIndex(from_coor); | ||||||
|  |       int fiidx = Fg->iIndex(from_coor); | ||||||
|  |       int toidx = Tg->oIndex(to_coor); | ||||||
|  |       int tiidx = Tg->iIndex(to_coor); | ||||||
|  |       int* tt = table + 4*idx; | ||||||
|  |       tt[0] = foidx; | ||||||
|  |       tt[1] = fiidx; | ||||||
|  |       tt[2] = toidx; | ||||||
|  |       tt[3] = tiidx; | ||||||
|  |     }); | ||||||
|  |    | ||||||
|  |   int* table_d = (int*)acceleratorAllocDevice(tbytes); | ||||||
|  |   acceleratorCopyToDevice(table,table_d,tbytes); | ||||||
|  |  | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |  | ||||||
|  |   autoView(from_v,From,AcceleratorRead); | ||||||
|  |   autoView(to_v,To,AcceleratorWrite); | ||||||
|  |    | ||||||
|  |   accelerator_for(idx,nsite,1,{ | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|  |       int* tt = table_d + 4*idx; | ||||||
|  |       int from_oidx = *tt++; | ||||||
|  |       int from_lane = *tt++; | ||||||
|  |       int to_oidx = *tt++; | ||||||
|  |       int to_lane = *tt; | ||||||
|  |  | ||||||
|  |       const vector_type* from = (const vector_type *)&from_v[from_oidx]; | ||||||
|  |       vector_type* to = (vector_type *)&to_v[to_oidx]; | ||||||
|  |        | ||||||
|  |       scalar_type stmp; | ||||||
|  |       for(int w=0;w<words;w++){ | ||||||
|  | 	stmp = getlane(from[w], from_lane); | ||||||
|  | 	putlane(to[w], stmp, to_lane); | ||||||
|  |       } | ||||||
|  |     }); | ||||||
|  |    | ||||||
|  |   acceleratorFreeDevice(table_d);     | ||||||
|  |   free(table); | ||||||
|  |    | ||||||
|  |  | ||||||
|  | #else   | ||||||
|   Coordinate ldf = Fg->_ldimensions; |   Coordinate ldf = Fg->_ldimensions; | ||||||
|   Coordinate rdf = Fg->_rdimensions; |   Coordinate rdf = Fg->_rdimensions; | ||||||
|   Coordinate isf = Fg->_istride; |   Coordinate isf = Fg->_istride; | ||||||
| @@ -658,9 +767,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | |||||||
|   Coordinate ist = Tg->_istride; |   Coordinate ist = Tg->_istride; | ||||||
|   Coordinate ost = Tg->_ostride; |   Coordinate ost = Tg->_ostride; | ||||||
|  |  | ||||||
|   autoView( t_v , To, AcceleratorWrite); |   autoView( t_v , To, CpuWrite); | ||||||
|   autoView( f_v , From, AcceleratorRead); |   autoView( f_v , From, CpuRead); | ||||||
|   accelerator_for(idx,Fg->lSites(),1,{ |   thread_for(idx,Fg->lSites(),{ | ||||||
|     sobj s; |     sobj s; | ||||||
|     Coordinate Fcoor(nd); |     Coordinate Fcoor(nd); | ||||||
|     Coordinate Tcoor(nd); |     Coordinate Tcoor(nd); | ||||||
| @@ -673,17 +782,24 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro | |||||||
|       Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; |       Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; | ||||||
|     } |     } | ||||||
|     if (in_region) { |     if (in_region) { | ||||||
|       Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); | #if 0       | ||||||
|       Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); |       Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from | ||||||
|       Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); |       Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to | ||||||
|       Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); |       Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from | ||||||
|       vector_type * fp = (vector_type *)&f_v[odx_f]; |       Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to | ||||||
|       vector_type * tp = (vector_type *)&t_v[odx_t]; |       scalar_type * fp = (scalar_type *)&f_v[odx_f]; | ||||||
|  |       scalar_type * tp = (scalar_type *)&t_v[odx_t]; | ||||||
|       for(int w=0;w<words;w++){ |       for(int w=0;w<words;w++){ | ||||||
| 	tp[w].putlane(fp[w].getlane(idx_f),idx_t); | 	tp[w].putlane(fp[w].getlane(idx_f),idx_t); | ||||||
|       } |       } | ||||||
|  | #else | ||||||
|  |     peekLocalSite(s,f_v,Fcoor); | ||||||
|  |     pokeLocalSite(s,t_v,Tcoor); | ||||||
|  | #endif | ||||||
|     } |     } | ||||||
|   }); |   }); | ||||||
|  |  | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -776,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic | |||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim | ||||||
|  | //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same | ||||||
| template<class vobj> | template<class vobj> | ||||||
| void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||||
| { | { | ||||||
| @@ -792,11 +910,70 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int | |||||||
|  |  | ||||||
|   for(int d=0;d<nh;d++){ |   for(int d=0;d<nh;d++){ | ||||||
|     if ( d!=orthog ) { |     if ( d!=orthog ) { | ||||||
|     assert(lg->_processors[d]  == hg->_processors[d]); |       assert(lg->_processors[d]  == hg->_processors[d]); | ||||||
|     assert(lg->_ldimensions[d] == hg->_ldimensions[d]); |       assert(lg->_ldimensions[d] == hg->_ldimensions[d]); | ||||||
|   } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | #if 1 | ||||||
|  |   size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog]; | ||||||
|  |   size_t tbytes = 4*nsite*sizeof(int); | ||||||
|  |   int *table = (int*)malloc(tbytes); | ||||||
|  |    | ||||||
|  |   thread_for(idx,nsite,{ | ||||||
|  |     Coordinate lcoor(nl); | ||||||
|  |     Coordinate hcoor(nh); | ||||||
|  |     lcoor[orthog] = slice_lo; | ||||||
|  |     hcoor[orthog] = slice_hi; | ||||||
|  |     size_t rem = idx; | ||||||
|  |     for(int mu=0;mu<nl;mu++){ | ||||||
|  |       if(mu != orthog){ | ||||||
|  | 	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu]; | ||||||
|  | 	lcoor[mu] = hcoor[mu] = xmu; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     int loidx = lg->oIndex(lcoor); | ||||||
|  |     int liidx = lg->iIndex(lcoor); | ||||||
|  |     int hoidx = hg->oIndex(hcoor); | ||||||
|  |     int hiidx = hg->iIndex(hcoor); | ||||||
|  |     int* tt = table + 4*idx; | ||||||
|  |     tt[0] = loidx; | ||||||
|  |     tt[1] = liidx; | ||||||
|  |     tt[2] = hoidx; | ||||||
|  |     tt[3] = hiidx; | ||||||
|  |     }); | ||||||
|  |     | ||||||
|  |   int* table_d = (int*)acceleratorAllocDevice(tbytes); | ||||||
|  |   acceleratorCopyToDevice(table,table_d,tbytes); | ||||||
|  |  | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |  | ||||||
|  |   autoView(lowDim_v,lowDim,AcceleratorRead); | ||||||
|  |   autoView(higherDim_v,higherDim,AcceleratorWrite); | ||||||
|  |    | ||||||
|  |   accelerator_for(idx,nsite,1,{ | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|  |       int* tt = table_d + 4*idx; | ||||||
|  |       int from_oidx = *tt++; | ||||||
|  |       int from_lane = *tt++; | ||||||
|  |       int to_oidx = *tt++; | ||||||
|  |       int to_lane = *tt; | ||||||
|  |  | ||||||
|  |       const vector_type* from = (const vector_type *)&lowDim_v[from_oidx]; | ||||||
|  |       vector_type* to = (vector_type *)&higherDim_v[to_oidx]; | ||||||
|  |        | ||||||
|  |       scalar_type stmp; | ||||||
|  |       for(int w=0;w<words;w++){ | ||||||
|  | 	stmp = getlane(from[w], from_lane); | ||||||
|  | 	putlane(to[w], stmp, to_lane); | ||||||
|  |       } | ||||||
|  |     }); | ||||||
|  |    | ||||||
|  |   acceleratorFreeDevice(table_d);     | ||||||
|  |   free(table); | ||||||
|  |    | ||||||
|  | #else | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|   autoView(lowDimv,lowDim,CpuRead); |   autoView(lowDimv,lowDim,CpuRead); | ||||||
|   autoView(higherDimv,higherDim,CpuWrite); |   autoView(higherDimv,higherDim,CpuWrite); | ||||||
| @@ -812,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int | |||||||
|       pokeLocalSite(s,higherDimv,hcoor); |       pokeLocalSite(s,higherDimv,hcoor); | ||||||
|     } |     } | ||||||
|   }); |   }); | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										174
									
								
								Grid/lattice/PaddedCell.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								Grid/lattice/PaddedCell.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,174 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/lattice/PaddedCell.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2019 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle pboyle@bnl.gov | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #pragma once | ||||||
|  |  | ||||||
|  | #include<Grid/cshift/Cshift.h> | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | //Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions | ||||||
|  | template<typename vobj> | ||||||
|  | struct CshiftImplBase{ | ||||||
|  |   virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0; | ||||||
|  |   virtual ~CshiftImplBase(){} | ||||||
|  | }; | ||||||
|  | template<typename vobj> | ||||||
|  | struct CshiftImplDefault: public CshiftImplBase<vobj>{ | ||||||
|  |   Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); } | ||||||
|  | }; | ||||||
|  | template<typename Gimpl> | ||||||
|  | struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{ | ||||||
|  |   typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } | ||||||
|  | };   | ||||||
|  |  | ||||||
|  | class PaddedCell { | ||||||
|  | public: | ||||||
|  |   GridCartesian * unpadded_grid; | ||||||
|  |   int dims; | ||||||
|  |   int depth; | ||||||
|  |   std::vector<GridCartesian *> grids; | ||||||
|  |  | ||||||
|  |   ~PaddedCell() | ||||||
|  |   { | ||||||
|  |     DeleteGrids(); | ||||||
|  |   } | ||||||
|  |   PaddedCell(int _depth,GridCartesian *_grid) | ||||||
|  |   { | ||||||
|  |     unpadded_grid = _grid; | ||||||
|  |     depth=_depth; | ||||||
|  |     dims=_grid->Nd(); | ||||||
|  |     AllocateGrids(); | ||||||
|  |     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||||
|  |     for(int d=0;d<dims;d++){ | ||||||
|  |       assert(local[d]>=depth); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   void DeleteGrids(void) | ||||||
|  |   { | ||||||
|  |     for(int d=0;d<grids.size();d++){ | ||||||
|  |       delete grids[d]; | ||||||
|  |     } | ||||||
|  |     grids.resize(0); | ||||||
|  |   }; | ||||||
|  |   void AllocateGrids(void) | ||||||
|  |   { | ||||||
|  |     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||||
|  |     Coordinate simd      =unpadded_grid->_simd_layout; | ||||||
|  |     Coordinate processors=unpadded_grid->_processors; | ||||||
|  |     Coordinate plocal    =unpadded_grid->LocalDimensions(); | ||||||
|  |     Coordinate global(dims); | ||||||
|  |  | ||||||
|  |     // expand up one dim at a time | ||||||
|  |     for(int d=0;d<dims;d++){ | ||||||
|  |  | ||||||
|  |       plocal[d] += 2*depth;  | ||||||
|  |  | ||||||
|  |       for(int d=0;d<dims;d++){ | ||||||
|  | 	global[d] = plocal[d]*processors[d]; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       grids.push_back(new GridCartesian(global,simd,processors)); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |   template<class vobj> | ||||||
|  |   inline Lattice<vobj> Extract(const Lattice<vobj> &in) const | ||||||
|  |   { | ||||||
|  |     Lattice<vobj> out(unpadded_grid); | ||||||
|  |  | ||||||
|  |     Coordinate local     =unpadded_grid->LocalDimensions(); | ||||||
|  |     Coordinate fll(dims,depth); // depends on the MPI spread | ||||||
|  |     Coordinate tll(dims,0); // depends on the MPI spread | ||||||
|  |     localCopyRegion(in,out,fll,tll,local); | ||||||
|  |     return out; | ||||||
|  |   } | ||||||
|  |   template<class vobj> | ||||||
|  |   inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const | ||||||
|  |   { | ||||||
|  |     GridBase *old_grid = in.Grid(); | ||||||
|  |     int dims = old_grid->Nd(); | ||||||
|  |     Lattice<vobj> tmp = in; | ||||||
|  |     for(int d=0;d<dims;d++){ | ||||||
|  |       tmp = Expand(d,tmp,cshift); // rvalue && assignment | ||||||
|  |     } | ||||||
|  |     return tmp; | ||||||
|  |   } | ||||||
|  |   // expand up one dim at a time | ||||||
|  |   template<class vobj> | ||||||
|  |   inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const | ||||||
|  |   { | ||||||
|  |     GridBase *old_grid = in.Grid(); | ||||||
|  |     GridCartesian *new_grid = grids[dim];//These are new grids | ||||||
|  |     Lattice<vobj>  padded(new_grid); | ||||||
|  |     Lattice<vobj> shifted(old_grid);     | ||||||
|  |     Coordinate local     =old_grid->LocalDimensions(); | ||||||
|  |     Coordinate plocal    =new_grid->LocalDimensions(); | ||||||
|  |     if(dim==0) conformable(old_grid,unpadded_grid); | ||||||
|  |     else       conformable(old_grid,grids[dim-1]); | ||||||
|  |  | ||||||
|  |     std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl; | ||||||
|  |  | ||||||
|  |     double tins=0, tshift=0; | ||||||
|  |      | ||||||
|  |     // Middle bit | ||||||
|  |     double t = usecond(); | ||||||
|  |     for(int x=0;x<local[dim];x++){ | ||||||
|  |       InsertSliceLocal(in,padded,x,depth+x,dim); | ||||||
|  |     } | ||||||
|  |     tins += usecond() - t; | ||||||
|  |      | ||||||
|  |     // High bit | ||||||
|  |     t = usecond(); | ||||||
|  |     shifted = cshift.Cshift(in,dim,depth); | ||||||
|  |     tshift += usecond() - t; | ||||||
|  |  | ||||||
|  |     t=usecond(); | ||||||
|  |     for(int x=0;x<depth;x++){ | ||||||
|  |       InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim); | ||||||
|  |     } | ||||||
|  |     tins += usecond() - t; | ||||||
|  |      | ||||||
|  |     // Low bit | ||||||
|  |     t = usecond(); | ||||||
|  |     shifted = cshift.Cshift(in,dim,-depth); | ||||||
|  |     tshift += usecond() - t; | ||||||
|  |      | ||||||
|  |     t = usecond(); | ||||||
|  |     for(int x=0;x<depth;x++){ | ||||||
|  |       InsertSliceLocal(shifted,padded,x,x,dim); | ||||||
|  |     } | ||||||
|  |     tins += usecond() - t; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl; | ||||||
|  |      | ||||||
|  |     return padded; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | }; | ||||||
|  |   | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| @@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca | |||||||
| template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ; | template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ; | ||||||
| template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; | template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; | ||||||
| template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; | template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; | ||||||
|  | template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ; | ||||||
| template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; | template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; | ||||||
| template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >; | template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >; | ||||||
| template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >; | template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >; | ||||||
| @@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF; | |||||||
| typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD; | typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD; | ||||||
| typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2; | typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2; | ||||||
|  |  | ||||||
|  | // LorentzComplex | ||||||
|  | typedef iLorentzComplex<Complex  > LorentzComplex; | ||||||
|  | typedef iLorentzComplex<ComplexF > LorentzComplexF; | ||||||
|  | typedef iLorentzComplex<ComplexD > LorentzComplexD; | ||||||
|  |  | ||||||
|  | typedef iLorentzComplex<vComplex > vLorentzComplex; | ||||||
|  | typedef iLorentzComplex<vComplexF> vLorentzComplexF; | ||||||
|  | typedef iLorentzComplex<vComplexD> vLorentzComplexD; | ||||||
|  |  | ||||||
| // DoubleStored gauge field | // DoubleStored gauge field | ||||||
| typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix; | typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix; | ||||||
| typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; | typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; | ||||||
| @@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF; | |||||||
| typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD; | typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD; | ||||||
| typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2; | typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2; | ||||||
|  |  | ||||||
|  | typedef Lattice<vLorentzComplex>  LatticeLorentzComplex; | ||||||
|  | typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF; | ||||||
|  | typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD; | ||||||
|  |  | ||||||
| // DoubleStored gauge field | // DoubleStored gauge field | ||||||
| typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix; | typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix; | ||||||
| typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF; | typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF; | ||||||
|   | |||||||
| @@ -34,10 +34,24 @@ directory | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | /////////////////////////////////// | ||||||
|  | // Smart configuration base class | ||||||
|  | /////////////////////////////////// | ||||||
|  | template< class Field > | ||||||
|  | class ConfigurationBase | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |   ConfigurationBase() {} | ||||||
|  |   virtual ~ConfigurationBase() {} | ||||||
|  |   virtual void set_Field(Field& U) =0; | ||||||
|  |   virtual void smeared_force(Field&) = 0; | ||||||
|  |   virtual Field& get_SmearedU() =0; | ||||||
|  |   virtual Field &get_U(bool smeared = false) = 0; | ||||||
|  | }; | ||||||
|  |  | ||||||
| template <class GaugeField > | template <class GaugeField > | ||||||
| class Action  | class Action  | ||||||
| { | { | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   bool is_smeared = false; |   bool is_smeared = false; | ||||||
|   RealD deriv_norm_sum; |   RealD deriv_norm_sum; | ||||||
| @@ -77,16 +91,60 @@ public: | |||||||
|   void refresh_timer_stop(void)  { refresh_us+=usecond(); } |   void refresh_timer_stop(void)  { refresh_us+=usecond(); } | ||||||
|   void S_timer_start(void)       { S_us-=usecond(); } |   void S_timer_start(void)       { S_us-=usecond(); } | ||||||
|   void S_timer_stop(void)        { S_us+=usecond(); } |   void S_timer_stop(void)        { S_us+=usecond(); } | ||||||
|  |   ///////////////////////////// | ||||||
|   // Heatbath? |   // Heatbath? | ||||||
|  |   ///////////////////////////// | ||||||
|   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions |   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions | ||||||
|   virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action |   virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action | ||||||
|   virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? |   virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? | ||||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative |   virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////// | ||||||
|  |   // virtual smeared interface through configuration container | ||||||
|  |   ///////////////////////////////////////////////////////////// | ||||||
|  |   virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) | ||||||
|  |   { | ||||||
|  |     refresh(U.get_U(is_smeared),sRNG,pRNG); | ||||||
|  |   } | ||||||
|  |   virtual RealD S(ConfigurationBase<GaugeField>& U) | ||||||
|  |   { | ||||||
|  |     return S(U.get_U(is_smeared)); | ||||||
|  |   } | ||||||
|  |   virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)  | ||||||
|  |   { | ||||||
|  |     return Sinitial(U.get_U(is_smeared)); | ||||||
|  |   } | ||||||
|  |   virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU) | ||||||
|  |   { | ||||||
|  |     deriv(U.get_U(is_smeared),dSdU);  | ||||||
|  |     if ( is_smeared ) { | ||||||
|  |       U.smeared_force(dSdU); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   /////////////////////////////// | ||||||
|  |   // Logging | ||||||
|  |   /////////////////////////////// | ||||||
|   virtual std::string action_name()    = 0;                             // return the action name |   virtual std::string action_name()    = 0;                             // return the action name | ||||||
|   virtual std::string LogParameters()  = 0;                             // prints action parameters |   virtual std::string LogParameters()  = 0;                             // prints action parameters | ||||||
|   virtual ~Action(){} |   virtual ~Action(){} | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | template <class GaugeField > | ||||||
|  | class EmptyAction : public Action <GaugeField> | ||||||
|  | { | ||||||
|  |   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions | ||||||
|  |   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action | ||||||
|  |   virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative | ||||||
|  |  | ||||||
|  |   /////////////////////////////// | ||||||
|  |   // Logging | ||||||
|  |   /////////////////////////////// | ||||||
|  |   virtual std::string action_name()    { return std::string("Level Force Log"); }; | ||||||
|  |   virtual std::string LogParameters()  { return std::string("No parameters");}; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #endif // ACTION_BASE_H | #endif // ACTION_BASE_H | ||||||
|   | |||||||
| @@ -30,6 +30,8 @@ directory | |||||||
| #ifndef QCD_ACTION_CORE | #ifndef QCD_ACTION_CORE | ||||||
| #define QCD_ACTION_CORE | #define QCD_ACTION_CORE | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/action/gauge/GaugeImplementations.h> | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/ActionBase.h> | #include <Grid/qcd/action/ActionBase.h> | ||||||
| NAMESPACE_CHECK(ActionBase); | NAMESPACE_CHECK(ActionBase); | ||||||
| #include <Grid/qcd/action/ActionSet.h> | #include <Grid/qcd/action/ActionSet.h> | ||||||
|   | |||||||
| @@ -126,6 +126,16 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermi | |||||||
| typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF; | typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF; | ||||||
| typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD; | typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD; | ||||||
|  |  | ||||||
|  | // Sp(2n) | ||||||
|  | typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF; | ||||||
|  | typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD; | ||||||
|  |  | ||||||
|  | typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF; | ||||||
|  | typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD; | ||||||
|  |  | ||||||
|  | typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF; | ||||||
|  | typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD; | ||||||
|  |  | ||||||
| // Twisted mass fermion | // Twisted mass fermion | ||||||
| typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2; | typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2; | ||||||
| typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; | typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; | ||||||
|   | |||||||
| @@ -507,6 +507,7 @@ public: | |||||||
|     } |     } | ||||||
|     this->face_table_computed=1; |     this->face_table_computed=1; | ||||||
|     assert(this->u_comm_offset==this->_unified_buffer_size); |     assert(this->u_comm_offset==this->_unified_buffer_size); | ||||||
|  |     accelerator_barrier(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -261,6 +261,22 @@ typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > W | |||||||
| typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float | typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float | ||||||
| typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double | typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double | ||||||
|  |  | ||||||
|  | //sp 2n | ||||||
|  |  | ||||||
|  | typedef WilsonImpl<vComplex,  SpFundamentalRepresentation, CoeffReal > SpWilsonImplR;  // Real.. whichever prec | ||||||
|  | typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF;  // Float | ||||||
|  | typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD;  // Double | ||||||
|  |  | ||||||
|  | typedef WilsonImpl<vComplex,  SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR;  // Real.. whichever prec | ||||||
|  | typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF;  // Float | ||||||
|  | typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD;  // Double | ||||||
|  |  | ||||||
|  | typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR;  // Real.. whichever prec | ||||||
|  | typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF;  // Float | ||||||
|  | typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD;  // Double | ||||||
|  |  | ||||||
|  | typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR;  // Real.. whichever prec    // adj = 2indx symmetric for Sp(2N) | ||||||
|  | typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF;  // Float     // adj = 2indx symmetric for Sp(2N) | ||||||
|  | typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD;  // Double    // adj = 2indx symmetric for Sp(2N) | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   { |   { | ||||||
|     GRID_TRACE("Gather"); |     GRID_TRACE("Gather"); | ||||||
|     st.HaloExchangeOptGather(in,compressor); |     st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine | ||||||
|     accelerator_barrier(); |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   std::vector<std::vector<CommsRequest_t> > requests; |   std::vector<std::vector<CommsRequest_t> > requests; | ||||||
|   | |||||||
| @@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | |||||||
| #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); | #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); | ||||||
|  |  | ||||||
| #define KERNEL_CALL_EXT(A)						\ | #define KERNEL_CALL_EXT(A)						\ | ||||||
|   const uint64_t    NN = Nsite*Ls;					\ |  | ||||||
|   const uint64_t    sz = st.surface_list.size();			\ |   const uint64_t    sz = st.surface_list.size();			\ | ||||||
|   auto ptr = &st.surface_list[0];					\ |   auto ptr = &st.surface_list[0];					\ | ||||||
|   accelerator_forNB( ss, sz, Simd::Nsimd(), {				\ |   accelerator_forNB( ss, sz, Simd::Nsimd(), {				\ | ||||||
|       int sF = ptr[ss];							\ |       int sF = ptr[ss];							\ | ||||||
|       int sU = ss/Ls;							\ |       int sU = sF/Ls;							\ | ||||||
|       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ |       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ | ||||||
|     });									 |     });									\ | ||||||
|  |   accelerator_barrier(); | ||||||
|  |  | ||||||
| #define ASM_CALL(A)							\ | #define ASM_CALL(A)							\ | ||||||
|   thread_for( sss, Nsite, {						\ |   thread_for( sss, Nsite, {						\ | ||||||
| @@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||||
| #ifdef SYCL_HACK      |  | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; } |  | ||||||
| #else |  | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||||
| #endif      |  | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;} | ||||||
| #endif | #endif | ||||||
| @@ -478,8 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} | ||||||
| #endif | #endif | ||||||
|    } else if( exterior ) { |    } else if( exterior ) { | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;} |      // dependent on result of merge | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;} |      acceleratorFenceComputeStream(); | ||||||
|  |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} | ||||||
|  |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;} | ||||||
| #endif | #endif | ||||||
| @@ -502,21 +500,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} | ||||||
| #endif | #endif | ||||||
|      acceleratorFenceComputeStream(); |  | ||||||
|    } else if( interior ) { |    } else if( interior ) { | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} | ||||||
| #endif | #endif | ||||||
|    } else if( exterior ) { |    } else if( exterior ) { | ||||||
|  |      // Dependent on result of merge | ||||||
|      acceleratorFenceComputeStream(); |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} | ||||||
| #endif | #endif | ||||||
|      acceleratorFenceComputeStream(); |  | ||||||
|    } |    } | ||||||
|    assert(0 && " Kernel optimisation case not covered "); |    assert(0 && " Kernel optimisation case not covered "); | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonImplD | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonCloverFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonTMFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonImplF | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonCloverFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonKernelsInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonTMFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonCloverFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonKernelsInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonTMFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplF | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonCloverFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonKernelsInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonTMFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplD | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonCloverFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonKernelsInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | ../WilsonTMFermionInstantiation.cc.master | ||||||
| @@ -0,0 +1 @@ | |||||||
|  | #define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplF | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../CayleyFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../ContinuedFractionFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../DomainWallEOFAFermionInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../MobiusEOFAFermionInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../PartialFractionFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../WilsonFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| #define IMPLEMENTATION WilsonImplD2 |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../CayleyFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../ContinuedFractionFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../DomainWallEOFAFermionInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../MobiusEOFAFermionInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../PartialFractionFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| ../WilsonFermion5DInstantiation.cc.master |  | ||||||
| @@ -1 +0,0 @@ | |||||||
| #define IMPLEMENTATION ZWilsonImplD2 |  | ||||||
| @@ -10,12 +10,18 @@ WILSON_IMPL_LIST=" \ | |||||||
| 	   WilsonImplF \ | 	   WilsonImplF \ | ||||||
| 	   WilsonImplD \ | 	   WilsonImplD \ | ||||||
| 	   WilsonImplD2 \ | 	   WilsonImplD2 \ | ||||||
|  | 	   SpWilsonImplF \ | ||||||
|  | 	   SpWilsonImplD \ | ||||||
| 	   WilsonAdjImplF \ | 	   WilsonAdjImplF \ | ||||||
| 	   WilsonAdjImplD \ | 	   WilsonAdjImplD \ | ||||||
| 	   WilsonTwoIndexSymmetricImplF \ | 	   WilsonTwoIndexSymmetricImplF \ | ||||||
| 	   WilsonTwoIndexSymmetricImplD \ | 	   WilsonTwoIndexSymmetricImplD \ | ||||||
| 	   WilsonTwoIndexAntiSymmetricImplF \ | 	   WilsonTwoIndexAntiSymmetricImplF \ | ||||||
| 	   WilsonTwoIndexAntiSymmetricImplD \ | 	   WilsonTwoIndexAntiSymmetricImplD \ | ||||||
|  | 	   SpWilsonTwoIndexAntiSymmetricImplF \ | ||||||
|  | 	   SpWilsonTwoIndexAntiSymmetricImplD \ | ||||||
|  | 	   SpWilsonTwoIndexSymmetricImplF \ | ||||||
|  | 	   SpWilsonTwoIndexSymmetricImplD \ | ||||||
| 	   GparityWilsonImplF \ | 	   GparityWilsonImplF \ | ||||||
| 	   GparityWilsonImplD " | 	   GparityWilsonImplD " | ||||||
|  |  | ||||||
|   | |||||||
| @@ -39,6 +39,9 @@ NAMESPACE_BEGIN(Grid); | |||||||
| typedef WilsonGaugeAction<PeriodicGimplR>          WilsonGaugeActionR; | typedef WilsonGaugeAction<PeriodicGimplR>          WilsonGaugeActionR; | ||||||
| typedef WilsonGaugeAction<PeriodicGimplF>          WilsonGaugeActionF; | typedef WilsonGaugeAction<PeriodicGimplF>          WilsonGaugeActionF; | ||||||
| typedef WilsonGaugeAction<PeriodicGimplD>          WilsonGaugeActionD; | typedef WilsonGaugeAction<PeriodicGimplD>          WilsonGaugeActionD; | ||||||
|  | typedef WilsonGaugeAction<SpPeriodicGimplR>        SpWilsonGaugeActionR; | ||||||
|  | typedef WilsonGaugeAction<SpPeriodicGimplF>        SpWilsonGaugeActionF; | ||||||
|  | typedef WilsonGaugeAction<SpPeriodicGimplD>        SpWilsonGaugeActionD; | ||||||
| typedef PlaqPlusRectangleAction<PeriodicGimplR>    PlaqPlusRectangleActionR; | typedef PlaqPlusRectangleAction<PeriodicGimplR>    PlaqPlusRectangleActionR; | ||||||
| typedef PlaqPlusRectangleAction<PeriodicGimplF>    PlaqPlusRectangleActionF; | typedef PlaqPlusRectangleAction<PeriodicGimplF>    PlaqPlusRectangleActionF; | ||||||
| typedef PlaqPlusRectangleAction<PeriodicGimplD>    PlaqPlusRectangleActionD; | typedef PlaqPlusRectangleAction<PeriodicGimplD>    PlaqPlusRectangleActionD; | ||||||
|   | |||||||
| @@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
|   typedef typename Impl::Field Field; |   typedef typename Impl::Field Field; | ||||||
|  |  | ||||||
| // hardcodes the exponential approximation in the template | // hardcodes the exponential approximation in the template | ||||||
| template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes { | template <class S, int Nrepresentation = Nc, int Nexp = 12, class Group = SU<Nc> > class GaugeImplTypes { | ||||||
| public: | public: | ||||||
|   typedef S Simd; |   typedef S Simd; | ||||||
|   typedef typename Simd::scalar_type scalar_type; |   typedef typename Simd::scalar_type scalar_type; | ||||||
| @@ -78,8 +78,6 @@ public: | |||||||
|   typedef Lattice<SiteLink>    LinkField;  |   typedef Lattice<SiteLink>    LinkField;  | ||||||
|   typedef Lattice<SiteField>   Field; |   typedef Lattice<SiteField>   Field; | ||||||
|  |  | ||||||
|   typedef SU<Nrepresentation> Group; |  | ||||||
|  |  | ||||||
|   // Guido: we can probably separate the types from the HMC functions |   // Guido: we can probably separate the types from the HMC functions | ||||||
|   // this will create 2 kind of implementations |   // this will create 2 kind of implementations | ||||||
|   // probably confusing the users |   // probably confusing the users | ||||||
| @@ -119,6 +117,7 @@ public: | |||||||
|     // |     // | ||||||
|     LinkField Pmu(P.Grid()); |     LinkField Pmu(P.Grid()); | ||||||
|     Pmu = Zero(); |     Pmu = Zero(); | ||||||
|  |  | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|       Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu); |       Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu); | ||||||
|       RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ; |       RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ; | ||||||
| @@ -126,8 +125,12 @@ public: | |||||||
|       PokeIndex<LorentzIndex>(P, Pmu, mu); |       PokeIndex<LorentzIndex>(P, Pmu, mu); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |      | ||||||
|   static inline Field projectForce(Field &P) { return Ta(P); } |   static inline Field projectForce(Field &P) { | ||||||
|  |       Field ret(P.Grid()); | ||||||
|  |       Group::taProj(P, ret); | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |  | ||||||
|   static inline void update_field(Field& P, Field& U, double ep){ |   static inline void update_field(Field& P, Field& U, double ep){ | ||||||
|     //static std::chrono::duration<double> diff; |     //static std::chrono::duration<double> diff; | ||||||
| @@ -137,14 +140,15 @@ public: | |||||||
|     autoView(P_v,P,AcceleratorRead); |     autoView(P_v,P,AcceleratorRead); | ||||||
|     accelerator_for(ss, P.Grid()->oSites(),1,{ |     accelerator_for(ss, P.Grid()->oSites(),1,{ | ||||||
|       for (int mu = 0; mu < Nd; mu++) { |       for (int mu = 0; mu < Nd; mu++) { | ||||||
|         U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); |           U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); | ||||||
|  |           U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); | ||||||
|       } |       } | ||||||
|     }); |     }); | ||||||
|    //auto end = std::chrono::high_resolution_clock::now(); |    //auto end = std::chrono::high_resolution_clock::now(); | ||||||
|    // diff += end - start; |    // diff += end - start; | ||||||
|    // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; |    // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; | ||||||
|   } |   } | ||||||
|  |      | ||||||
|   static inline RealD FieldSquareNorm(Field& U){ |   static inline RealD FieldSquareNorm(Field& U){ | ||||||
|     LatticeComplex Hloc(U.Grid()); |     LatticeComplex Hloc(U.Grid()); | ||||||
|     Hloc = Zero(); |     Hloc = Zero(); | ||||||
| @@ -157,7 +161,7 @@ public: | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   static inline void Project(Field &U) { |   static inline void Project(Field &U) { | ||||||
|     ProjectSUn(U); |     Group::ProjectOnSpecialGroup(U); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { |   static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||||
| @@ -171,6 +175,7 @@ public: | |||||||
|   static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { |   static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||||
|     Group::ColdConfiguration(pRNG, U); |     Group::ColdConfiguration(pRNG, U); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -178,10 +183,17 @@ typedef GaugeImplTypes<vComplex, Nc> GimplTypesR; | |||||||
| typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF; | typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF; | ||||||
| typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD; | typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD; | ||||||
|  |  | ||||||
|  | typedef GaugeImplTypes<vComplex, Nc, 12, Sp<Nc> > SpGimplTypesR; | ||||||
|  | typedef GaugeImplTypes<vComplexF, Nc, 12, Sp<Nc> > SpGimplTypesF; | ||||||
|  | typedef GaugeImplTypes<vComplexD, Nc, 12, Sp<Nc> > SpGimplTypesD; | ||||||
|  |  | ||||||
| typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR; | typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR; | ||||||
| typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF; | typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF; | ||||||
| typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD; | typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #endif // GRID_GAUGE_IMPL_TYPES_H | #endif // GRID_GAUGE_IMPL_TYPES_H | ||||||
|   | |||||||
| @@ -176,7 +176,7 @@ public: | |||||||
|       return PeriodicBC::CshiftLink(Link,mu,shift); |       return PeriodicBC::CshiftLink(Link,mu,shift); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; } |   static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; } | ||||||
|   static inline std::vector<int> getDirections(void) { return _conjDirs; } |   static inline std::vector<int> getDirections(void) { return _conjDirs; } | ||||||
|   static inline bool isPeriodicGaugeField(void) { return false; } |   static inline bool isPeriodicGaugeField(void) { return false; } | ||||||
| }; | }; | ||||||
| @@ -193,6 +193,11 @@ typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever pre | |||||||
| typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float | typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float | ||||||
| typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double | typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double | ||||||
|  |  | ||||||
|  | typedef PeriodicGaugeImpl<SpGimplTypesR> SpPeriodicGimplR; // Real.. whichever prec | ||||||
|  | typedef PeriodicGaugeImpl<SpGimplTypesF> SpPeriodicGimplF; // Float | ||||||
|  | typedef PeriodicGaugeImpl<SpGimplTypesD> SpPeriodicGimplD; // Double | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -43,7 +43,7 @@ public: | |||||||
| private: | private: | ||||||
|   RealD c_plaq; |   RealD c_plaq; | ||||||
|   RealD c_rect; |   RealD c_rect; | ||||||
|  |   typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace; | ||||||
| public: | public: | ||||||
|   PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){}; |   PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){}; | ||||||
|  |  | ||||||
| @@ -79,27 +79,18 @@ public: | |||||||
|     GridBase *grid = Umu.Grid(); |     GridBase *grid = Umu.Grid(); | ||||||
|  |  | ||||||
|     std::vector<GaugeLinkField> U (Nd,grid); |     std::vector<GaugeLinkField> U (Nd,grid); | ||||||
|     std::vector<GaugeLinkField> U2(Nd,grid); |  | ||||||
|  |  | ||||||
|     for(int mu=0;mu<Nd;mu++){ |     for(int mu=0;mu<Nd;mu++){ | ||||||
|       U[mu] = PeekIndex<LorentzIndex>(Umu,mu); |       U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||||
|       WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu); |  | ||||||
|     } |     } | ||||||
|  |     std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid); | ||||||
|  |     WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace); | ||||||
|  |  | ||||||
|     GaugeLinkField dSdU_mu(grid); |     GaugeLinkField dSdU_mu(grid); | ||||||
|     GaugeLinkField staple(grid); |     GaugeLinkField staple(grid); | ||||||
|  |  | ||||||
|     for (int mu=0; mu < Nd; mu++){ |     for (int mu=0; mu < Nd; mu++){ | ||||||
|  |       dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p; | ||||||
|       // Staple in direction mu |       dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r; | ||||||
|  |  | ||||||
|       WilsonLoops<Gimpl>::Staple(staple,Umu,mu); |  | ||||||
|  |  | ||||||
|       dSdU_mu = Ta(U[mu]*staple)*factor_p; |  | ||||||
|  |  | ||||||
|       WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu); |  | ||||||
|  |  | ||||||
|       dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; |  | ||||||
| 	   | 	   | ||||||
|       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); |       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -86,8 +86,13 @@ public: | |||||||
|     assert(ForceE.Checkerboard()==Even); |     assert(ForceE.Checkerboard()==Even); | ||||||
|     assert(ForceO.Checkerboard()==Odd); |     assert(ForceO.Checkerboard()==Odd); | ||||||
|  |  | ||||||
|  | #if defined(GRID_CUDA) || defined(GRID_HIP)  || defined(GRID_SYCL) | ||||||
|  |     acceleratorSetCheckerboard(Force,ForceE); | ||||||
|  |     acceleratorSetCheckerboard(Force,ForceO); | ||||||
|  | #else | ||||||
|     setCheckerboard(Force,ForceE);  |     setCheckerboard(Force,ForceE);  | ||||||
|     setCheckerboard(Force,ForceO); |     setCheckerboard(Force,ForceO); | ||||||
|  | #endif | ||||||
|     Force=-Force; |     Force=-Force; | ||||||
|  |  | ||||||
|     delete forcecb; |     delete forcecb; | ||||||
| @@ -130,8 +135,13 @@ public: | |||||||
|     assert(ForceE.Checkerboard()==Even); |     assert(ForceE.Checkerboard()==Even); | ||||||
|     assert(ForceO.Checkerboard()==Odd); |     assert(ForceO.Checkerboard()==Odd); | ||||||
|  |  | ||||||
|  | #if defined(GRID_CUDA) || defined(GRID_HIP)  || defined(GRID_SYCL) | ||||||
|  |     acceleratorSetCheckerboard(Force,ForceE); | ||||||
|  |     acceleratorSetCheckerboard(Force,ForceO); | ||||||
|  | #else | ||||||
|     setCheckerboard(Force,ForceE);  |     setCheckerboard(Force,ForceE);  | ||||||
|     setCheckerboard(Force,ForceO); |     setCheckerboard(Force,ForceO); | ||||||
|  | #endif | ||||||
|     Force=-Force; |     Force=-Force; | ||||||
|  |  | ||||||
|     delete forcecb; |     delete forcecb; | ||||||
|   | |||||||
| @@ -38,91 +38,73 @@ NAMESPACE_BEGIN(Grid); | |||||||
|     // cf. GeneralEvenOddRational.h for details |     // cf. GeneralEvenOddRational.h for details | ||||||
|     ///////////////////////////////////////////////////////////////////////////////////////////////////////////// |     ///////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|        |        | ||||||
|     template<class ImplD, class ImplF, class ImplD2> |     template<class ImplD, class ImplF> | ||||||
|     class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> { |     class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> { | ||||||
|     private: |     private: | ||||||
|       typedef typename ImplD2::FermionField FermionFieldD2; |  | ||||||
|       typedef typename ImplD::FermionField FermionFieldD; |       typedef typename ImplD::FermionField FermionFieldD; | ||||||
|       typedef typename ImplF::FermionField FermionFieldF; |       typedef typename ImplF::FermionField FermionFieldF; | ||||||
|  |  | ||||||
|       FermionOperator<ImplD> & NumOpD; |       FermionOperator<ImplD> & NumOpD; | ||||||
|       FermionOperator<ImplD> & DenOpD; |       FermionOperator<ImplD> & DenOpD; | ||||||
|  |  | ||||||
|       FermionOperator<ImplD2> & NumOpD2; |  | ||||||
|       FermionOperator<ImplD2> & DenOpD2; |  | ||||||
|       |  | ||||||
|       FermionOperator<ImplF> & NumOpF; |       FermionOperator<ImplF> & NumOpF; | ||||||
|       FermionOperator<ImplF> & DenOpF; |       FermionOperator<ImplF> & DenOpF; | ||||||
|  |  | ||||||
|       Integer ReliableUpdateFreq; |       Integer ReliableUpdateFreq; | ||||||
|     protected: |     protected: | ||||||
|  |  | ||||||
|  |       //Action evaluation | ||||||
|       //Allow derived classes to override the multishift CG |       //Allow derived classes to override the multishift CG | ||||||
|       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){ |       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){ | ||||||
| #if 0 | #if 1 | ||||||
| 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD); | 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD); | ||||||
| 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx); | 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx); | ||||||
| 	msCG(schurOp,in, out); | 	msCG(schurOp,in, out); | ||||||
| #else | #else | ||||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||||
| 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF); | 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF); | ||||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||||
|  |  | ||||||
| 	// Action better with higher precision? | 	// Action better with higher precision? | ||||||
| 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||||
| 	precisionChange(inD2,in); | 	msCG(schurOpD, in, out); | ||||||
| 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; |  | ||||||
| 	msCG(schurOpD2, inD2, outD2); |  | ||||||
| 	precisionChange(out,outD2); |  | ||||||
| #endif | #endif | ||||||
|       } |       } | ||||||
|  |       //Force evaluation | ||||||
|       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){ |       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){ | ||||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||||
| 	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF); | 	SchurDifferentiableOperator<ImplF>  schurOpF(numerator ? NumOpF  : DenOpF); | ||||||
|  |  | ||||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||||
| 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid()); | 	std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid()); | ||||||
| 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||||
| 	precisionChange(inD2,in); | 	msCG(schurOpD, in, out_elems, out); | ||||||
| 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; |  | ||||||
| 	msCG(schurOpD2, inD2, out_elemsD2, outD2); |  | ||||||
| 	precisionChange(out,outD2); |  | ||||||
| 	for(int i=0;i<out_elems.size();i++){ |  | ||||||
| 	  precisionChange(out_elems[i],out_elemsD2[i]); |  | ||||||
| 	} |  | ||||||
|       } |       } | ||||||
|       //Allow derived classes to override the gauge import |       //Allow derived classes to override the gauge import | ||||||
|       virtual void ImportGauge(const typename ImplD::GaugeField &Ud){ |       virtual void ImportGauge(const typename ImplD::GaugeField &Ud){ | ||||||
|  |  | ||||||
| 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid()); | 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid()); | ||||||
| 	typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid()); |  | ||||||
| 	precisionChange(Uf, Ud); | 	precisionChange(Uf, Ud); | ||||||
| 	precisionChange(Ud2, Ud); |  | ||||||
|  |  | ||||||
| 	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl; | 	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl; | ||||||
| 	 | 	 | ||||||
| 	NumOpD.ImportGauge(Ud); | 	NumOpD.ImportGauge(Ud); | ||||||
| 	DenOpD.ImportGauge(Ud); | 	DenOpD.ImportGauge(Ud); | ||||||
|  |  | ||||||
| 	NumOpF.ImportGauge(Uf); | 	NumOpF.ImportGauge(Uf); | ||||||
| 	DenOpF.ImportGauge(Uf); | 	DenOpF.ImportGauge(Uf); | ||||||
|  |  | ||||||
| 	NumOpD2.ImportGauge(Ud2); |  | ||||||
| 	DenOpD2.ImportGauge(Ud2); |  | ||||||
|       } |       } | ||||||
|        |        | ||||||
|     public: |     public: | ||||||
|       GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD,  |       GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD,  | ||||||
| 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,  | 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,  | ||||||
| 							      FermionOperator<ImplD2>  &_NumOpD2, FermionOperator<ImplD2>  &_DenOpD2,  |  | ||||||
| 							      const RationalActionParams & p, Integer _ReliableUpdateFreq | 							      const RationalActionParams & p, Integer _ReliableUpdateFreq | ||||||
| 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p), | 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p), | ||||||
| 								  ReliableUpdateFreq(_ReliableUpdateFreq), | 								  ReliableUpdateFreq(_ReliableUpdateFreq), | ||||||
| 								  NumOpD(_NumOpD), DenOpD(_DenOpD), | 								  NumOpD(_NumOpD), DenOpD(_DenOpD), | ||||||
| 								  NumOpF(_NumOpF), DenOpF(_DenOpF), | 								  NumOpF(_NumOpF), DenOpF(_DenOpF) | ||||||
| 								  NumOpD2(_NumOpD2), DenOpD2(_DenOpD2) |  | ||||||
|       {} |       {} | ||||||
|        |        | ||||||
|       virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";} |       virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";} | ||||||
|   | |||||||
| @@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid); | |||||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       |       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     template<class Impl,class ImplF,class ImplD2> |     template<class Impl,class ImplF> | ||||||
|     class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction |     class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction | ||||||
|       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> { |       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> { | ||||||
|     public: |     public: | ||||||
|       typedef OneFlavourRationalParams Params; |       typedef OneFlavourRationalParams Params; | ||||||
|     private: |     private: | ||||||
| @@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid); | |||||||
| 								 FermionOperator<Impl>  &_DenOp,  | 								 FermionOperator<Impl>  &_DenOp,  | ||||||
| 								 FermionOperator<ImplF>  &_NumOpF,  | 								 FermionOperator<ImplF>  &_NumOpF,  | ||||||
| 								 FermionOperator<ImplF>  &_DenOpF,  | 								 FermionOperator<ImplF>  &_DenOpF,  | ||||||
| 								 FermionOperator<ImplD2>  &_NumOpD2,  |  | ||||||
| 								 FermionOperator<ImplD2>  &_DenOpD2,  |  | ||||||
| 								 const Params & p, Integer ReliableUpdateFreq | 								 const Params & p, Integer ReliableUpdateFreq | ||||||
| 							) :  | 							) :  | ||||||
| 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){} | 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){} | ||||||
|  |  | ||||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       |       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||||
|     }; |     }; | ||||||
|   | |||||||
| @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); | |||||||
|         // NumOp == V |         // NumOp == V | ||||||
|         // DenOp == M |         // DenOp == M | ||||||
|         // |         // | ||||||
|     AUDIT(); |  | ||||||
|         FermionField etaOdd (NumOp.FermionRedBlackGrid()); |         FermionField etaOdd (NumOp.FermionRedBlackGrid()); | ||||||
|         FermionField etaEven(NumOp.FermionRedBlackGrid()); |         FermionField etaEven(NumOp.FermionRedBlackGrid()); | ||||||
|         FermionField tmp    (NumOp.FermionRedBlackGrid()); |         FermionField tmp    (NumOp.FermionRedBlackGrid()); | ||||||
|  |  | ||||||
|     AUDIT(); |  | ||||||
|         pickCheckerboard(Even,etaEven,eta); |         pickCheckerboard(Even,etaEven,eta); | ||||||
|     AUDIT(); |  | ||||||
|         pickCheckerboard(Odd,etaOdd,eta); |         pickCheckerboard(Odd,etaOdd,eta); | ||||||
|  |  | ||||||
|     AUDIT(); |  | ||||||
|         NumOp.ImportGauge(U); |         NumOp.ImportGauge(U); | ||||||
|     AUDIT(); |  | ||||||
|         DenOp.ImportGauge(U); |         DenOp.ImportGauge(U); | ||||||
| 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl; | 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl; | ||||||
|     AUDIT(); |  | ||||||
|  |  | ||||||
|         SchurDifferentiableOperator<Impl> Mpc(DenOp); |         SchurDifferentiableOperator<Impl> Mpc(DenOp); | ||||||
|     AUDIT(); |  | ||||||
|         SchurDifferentiableOperator<Impl> Vpc(NumOp); |         SchurDifferentiableOperator<Impl> Vpc(NumOp); | ||||||
|     AUDIT(); |  | ||||||
|  |  | ||||||
| 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl; | 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl; | ||||||
|     AUDIT(); |  | ||||||
|         // Odd det factors |         // Odd det factors | ||||||
|         Mpc.MpcDag(etaOdd,PhiOdd); |         Mpc.MpcDag(etaOdd,PhiOdd); | ||||||
|     AUDIT(); |  | ||||||
| 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl; | 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl; | ||||||
|         tmp=Zero(); |         tmp=Zero(); | ||||||
|     AUDIT(); |  | ||||||
| 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl; | 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl; | ||||||
|     AUDIT(); |  | ||||||
|         HeatbathSolver(Vpc,PhiOdd,tmp); |         HeatbathSolver(Vpc,PhiOdd,tmp); | ||||||
|     AUDIT(); |  | ||||||
| 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl; | 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl; | ||||||
|         Vpc.Mpc(tmp,PhiOdd);             |         Vpc.Mpc(tmp,PhiOdd);             | ||||||
| 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl; | 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl; | ||||||
| @@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid); | |||||||
|         //X = (Mdag M)^-1 V^dag phi |         //X = (Mdag M)^-1 V^dag phi | ||||||
|         //Y = (Mdag)^-1 V^dag  phi |         //Y = (Mdag)^-1 V^dag  phi | ||||||
|         Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi |         Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi | ||||||
|  | 	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl; | ||||||
|         X=Zero(); |         X=Zero(); | ||||||
|         DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi |         DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi | ||||||
|  | 	std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl; | ||||||
|         Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi |         Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi | ||||||
|  | 	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl; | ||||||
|  |  | ||||||
|         // phi^dag V (Mdag M)^-1 dV^dag  phi |         // phi^dag V (Mdag M)^-1 dV^dag  phi | ||||||
|         Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force; |         Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force; | ||||||
|  | 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||||
|    |    | ||||||
|         // phi^dag dV (Mdag M)^-1 V^dag  phi |         // phi^dag dV (Mdag M)^-1 V^dag  phi | ||||||
|         Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force; |         Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force; | ||||||
|  | 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||||
|  |  | ||||||
|         //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi |         //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi | ||||||
|         //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi |         //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi | ||||||
|         Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force; |         Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force; | ||||||
|  | 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||||
|         Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force; |         Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force; | ||||||
|  | 	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl; | ||||||
|  |  | ||||||
|         // FIXME No force contribution from EvenEven assumed here |         // FIXME No force contribution from EvenEven assumed here | ||||||
|         // Needs a fix for clover. |         // Needs a fix for clover. | ||||||
|   | |||||||
| @@ -225,6 +225,18 @@ template <class RepresentationsPolicy, | |||||||
| using GenericHMCRunnerHirep = | using GenericHMCRunnerHirep = | ||||||
| 				     HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>; | 				     HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>; | ||||||
|  |  | ||||||
|  | // sp2n | ||||||
|  |  | ||||||
|  | template <template <typename, typename, typename> class Integrator> | ||||||
|  | using GenericSpHMCRunner = HMCWrapperTemplate<SpPeriodicGimplR, Integrator>; | ||||||
|  |  | ||||||
|  | template <class RepresentationsPolicy, | ||||||
|  |           template <typename, typename, typename> class Integrator> | ||||||
|  | using GenericSpHMCRunnerHirep = | ||||||
|  |                      HMCWrapperTemplate<SpPeriodicGimplR, Integrator, RepresentationsPolicy>; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class Implementation, class RepresentationsPolicy,  | template <class Implementation, class RepresentationsPolicy,  | ||||||
|           template <typename, typename, typename> class Integrator> |           template <typename, typename, typename> class Integrator> | ||||||
| using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>; | using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>; | ||||||
|   | |||||||
| @@ -283,12 +283,13 @@ public: | |||||||
|       std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl; |       std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl; | ||||||
|  |  | ||||||
|       TheIntegrator.print_timer(); |       TheIntegrator.print_timer(); | ||||||
|  |        | ||||||
|  |       TheIntegrator.Smearer.set_Field(Ucur); | ||||||
|       for (int obs = 0; obs < Observables.size(); obs++) { |       for (int obs = 0; obs < Observables.size(); obs++) { | ||||||
|       	std::cout << GridLogDebug << "Observables # " << obs << std::endl; |       	std::cout << GridLogDebug << "Observables # " << obs << std::endl; | ||||||
|       	std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl; |       	std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl; | ||||||
|       	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl; |       	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl; | ||||||
|         Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG); |         Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG); | ||||||
|       } |       } | ||||||
|       std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl; |       std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable { | |||||||
| public: | public: | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,  |   GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,  | ||||||
| 				  std::string, config_prefix,  | 				  std::string, config_prefix,  | ||||||
|  | 				  std::string, smeared_prefix,  | ||||||
| 				  std::string, rng_prefix,  | 				  std::string, rng_prefix,  | ||||||
| 				  int, saveInterval,  | 				  int, saveInterval,  | ||||||
|  | 				  bool, saveSmeared,  | ||||||
| 				  std::string, format, ); | 				  std::string, format, ); | ||||||
|  |  | ||||||
|   CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng", |   CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng", | ||||||
| 			 int savemodulo = 1, const std::string &f = "IEEE64BIG") | 			 int savemodulo = 1, const std::string &f = "IEEE64BIG") | ||||||
|     : config_prefix(cf), |     : config_prefix(cf), | ||||||
|  |       smeared_prefix(sf), | ||||||
|       rng_prefix(rn), |       rng_prefix(rn), | ||||||
|       saveInterval(savemodulo), |       saveInterval(savemodulo), | ||||||
|       format(f){}; |       format(f){}; | ||||||
| @@ -61,13 +64,21 @@ template <class Impl> | |||||||
| class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> { | class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> { | ||||||
| public: | public: | ||||||
|   void build_filenames(int traj, CheckpointerParameters &Params, |   void build_filenames(int traj, CheckpointerParameters &Params, | ||||||
|                        std::string &conf_file, std::string &rng_file) { |                        std::string &conf_file, | ||||||
|  |                        std::string &smear_file, | ||||||
|  | 		       std::string &rng_file) { | ||||||
|     { |     { | ||||||
|       std::ostringstream os; |       std::ostringstream os; | ||||||
|       os << Params.rng_prefix << "." << traj; |       os << Params.rng_prefix << "." << traj; | ||||||
|       rng_file = os.str(); |       rng_file = os.str(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     { | ||||||
|  |       std::ostringstream os; | ||||||
|  |       os << Params.smeared_prefix << "." << traj; | ||||||
|  |       smear_file = os.str(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     { |     { | ||||||
|       std::ostringstream os; |       std::ostringstream os; | ||||||
|       os << Params.config_prefix << "." << traj; |       os << Params.config_prefix << "." << traj; | ||||||
| @@ -84,6 +95,11 @@ public: | |||||||
|   } |   } | ||||||
|   virtual void initialize(const CheckpointerParameters &Params) = 0; |   virtual void initialize(const CheckpointerParameters &Params) = 0; | ||||||
|  |  | ||||||
|  |   virtual void TrajectoryComplete(int traj, | ||||||
|  |                                   typename Impl::Field &U, | ||||||
|  |                                   GridSerialRNG &sRNG, | ||||||
|  |                                   GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared | ||||||
|  |    | ||||||
|   virtual void CheckpointRestore(int traj, typename Impl::Field &U, |   virtual void CheckpointRestore(int traj, typename Impl::Field &U, | ||||||
|                                  GridSerialRNG &sRNG, |                                  GridSerialRNG &sRNG, | ||||||
|                                  GridParallelRNG &pRNG) = 0; |                                  GridParallelRNG &pRNG) = 0; | ||||||
|   | |||||||
| @@ -61,11 +61,14 @@ public: | |||||||
|     fout.close(); |     fout.close(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { |   void TrajectoryComplete(int traj, | ||||||
|  | 			  ConfigurationBase<Field> &SmartConfig, | ||||||
|  | 			  GridSerialRNG &sRNG, GridParallelRNG &pRNG) | ||||||
|  |   { | ||||||
|  |  | ||||||
|     if ((traj % Params.saveInterval) == 0) { |     if ((traj % Params.saveInterval) == 0) { | ||||||
|       std::string config, rng; |       std::string config, rng, smr; | ||||||
|       this->build_filenames(traj, Params, config, rng); |       this->build_filenames(traj, Params, config, smr, rng); | ||||||
|  |  | ||||||
|       uint32_t nersc_csum; |       uint32_t nersc_csum; | ||||||
|       uint32_t scidac_csuma; |       uint32_t scidac_csuma; | ||||||
| @@ -74,9 +77,15 @@ public: | |||||||
|       BinarySimpleUnmunger<sobj_double, sobj> munge; |       BinarySimpleUnmunger<sobj_double, sobj> munge; | ||||||
|       truncate(rng); |       truncate(rng); | ||||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); |       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|       truncate(config); |       std::cout << GridLogMessage << "Written Binary RNG " << rng | ||||||
|  |                 << " checksum " << std::hex  | ||||||
|  | 		<< nersc_csum   <<"/" | ||||||
|  | 		<< scidac_csuma   <<"/" | ||||||
|  | 		<< scidac_csumb  | ||||||
|  | 		<< std::dec << std::endl; | ||||||
|  |  | ||||||
|       BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format, |       truncate(config); | ||||||
|  |       BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format, | ||||||
| 						      nersc_csum,scidac_csuma,scidac_csumb); | 						      nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage << "Written Binary Configuration " << config |       std::cout << GridLogMessage << "Written Binary Configuration " << config | ||||||
| @@ -85,6 +94,18 @@ public: | |||||||
| 		<< scidac_csuma   <<"/" | 		<< scidac_csuma   <<"/" | ||||||
| 		<< scidac_csumb  | 		<< scidac_csumb  | ||||||
| 		<< std::dec << std::endl; | 		<< std::dec << std::endl; | ||||||
|  |  | ||||||
|  |       if ( Params.saveSmeared ) { | ||||||
|  | 	truncate(smr); | ||||||
|  | 	BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format, | ||||||
|  | 							nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  | 	std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr | ||||||
|  |                 << " checksum " << std::hex  | ||||||
|  | 		<< nersc_csum   <<"/" | ||||||
|  | 		<< scidac_csuma   <<"/" | ||||||
|  | 		<< scidac_csumb  | ||||||
|  | 		<< std::dec << std::endl; | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|   | |||||||
| @@ -69,17 +69,27 @@ public: | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, |   void TrajectoryComplete(int traj, | ||||||
|  | 			  ConfigurationBase<GaugeField> &SmartConfig, | ||||||
|  | 			  GridSerialRNG &sRNG, | ||||||
|                           GridParallelRNG &pRNG) { |                           GridParallelRNG &pRNG) { | ||||||
|     if ((traj % Params.saveInterval) == 0) { |     if ((traj % Params.saveInterval) == 0) { | ||||||
|       std::string config, rng; |       std::string config, rng, smr; | ||||||
|       this->build_filenames(traj, Params, config, rng); |       this->build_filenames(traj, Params, config, rng); | ||||||
|       GridBase *grid = U.Grid(); |       GridBase *grid = SmartConfig.get_U(false).Grid(); | ||||||
|       uint32_t nersc_csum,scidac_csuma,scidac_csumb; |       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); |       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |       std::cout << GridLogMessage << "Written BINARY RNG " << rng | ||||||
|  |                 << " checksum " << std::hex  | ||||||
|  | 		<< nersc_csum<<"/" | ||||||
|  | 		<< scidac_csuma<<"/" | ||||||
|  | 		<< scidac_csumb | ||||||
|  | 		<< std::dec << std::endl; | ||||||
|  |  | ||||||
|  |        | ||||||
|       IldgWriter _IldgWriter(grid->IsBoss()); |       IldgWriter _IldgWriter(grid->IsBoss()); | ||||||
|       _IldgWriter.open(config); |       _IldgWriter.open(config); | ||||||
|       _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config); |       _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config); | ||||||
|       _IldgWriter.close(); |       _IldgWriter.close(); | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage << "Written ILDG Configuration on " << config |       std::cout << GridLogMessage << "Written ILDG Configuration on " << config | ||||||
| @@ -88,6 +98,21 @@ public: | |||||||
| 		<< scidac_csuma<<"/" | 		<< scidac_csuma<<"/" | ||||||
| 		<< scidac_csumb | 		<< scidac_csumb | ||||||
| 		<< std::dec << std::endl; | 		<< std::dec << std::endl; | ||||||
|  |  | ||||||
|  |       if ( Params.saveSmeared ) {  | ||||||
|  | 	IldgWriter _IldgWriter(grid->IsBoss()); | ||||||
|  | 	_IldgWriter.open(smr); | ||||||
|  | 	_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config); | ||||||
|  | 	_IldgWriter.close(); | ||||||
|  |  | ||||||
|  | 	std::cout << GridLogMessage << "Written ILDG Configuration on " << smr | ||||||
|  |                 << " checksum " << std::hex  | ||||||
|  | 		<< nersc_csum<<"/" | ||||||
|  | 		<< scidac_csuma<<"/" | ||||||
|  | 		<< scidac_csumb | ||||||
|  | 		<< std::dec << std::endl; | ||||||
|  |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -52,23 +52,29 @@ public: | |||||||
|     Params.format = "IEEE64BIG";  // fixed, overwrite any other choice |     Params.format = "IEEE64BIG";  // fixed, overwrite any other choice | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, |   virtual void TrajectoryComplete(int traj, | ||||||
|                           GridParallelRNG &pRNG) { |                                   ConfigurationBase<GaugeField> &SmartConfig, | ||||||
|  |                                   GridSerialRNG &sRNG, | ||||||
|  |                                   GridParallelRNG &pRNG) | ||||||
|  |   { | ||||||
|     if ((traj % Params.saveInterval) == 0) { |     if ((traj % Params.saveInterval) == 0) { | ||||||
|       std::string config, rng; |       std::string config, rng, smr; | ||||||
|       this->build_filenames(traj, Params, config, rng); |       this->build_filenames(traj, Params, config, smr, rng); | ||||||
|  |        | ||||||
|       int precision32 = 1; |       int precision32 = 1; | ||||||
|       int tworow = 0; |       int tworow = 0; | ||||||
|       NerscIO::writeRNGState(sRNG, pRNG, rng); |       NerscIO::writeRNGState(sRNG, pRNG, rng); | ||||||
|       NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32); |       NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32); | ||||||
|  |       if ( Params.saveSmeared ) { | ||||||
|  | 	NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32); | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG, |   void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG, | ||||||
|                          GridParallelRNG &pRNG) { |                          GridParallelRNG &pRNG) { | ||||||
|     std::string config, rng; |     std::string config, rng, smr; | ||||||
|     this->build_filenames(traj, Params, config, rng); |     this->build_filenames(traj, Params, config, smr, rng ); | ||||||
|     this->check_filename(rng); |     this->check_filename(rng); | ||||||
|     this->check_filename(config); |     this->check_filename(config); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, |   void TrajectoryComplete(int traj,  | ||||||
|  | 			  ConfigurationBase<Field> &SmartConfig, | ||||||
|  | 			  GridSerialRNG &sRNG, | ||||||
|                           GridParallelRNG &pRNG) { |                           GridParallelRNG &pRNG) { | ||||||
|     if ((traj % Params.saveInterval) == 0) { |     if ((traj % Params.saveInterval) == 0) { | ||||||
|       std::string config, rng; |       std::string config, rng,smr; | ||||||
|       this->build_filenames(traj, Params, config, rng); |       this->build_filenames(traj, Params, config, smr, rng); | ||||||
|       GridBase *grid = U.Grid(); |       GridBase *grid = SmartConfig.get_U(false).Grid(); | ||||||
|       uint32_t nersc_csum,scidac_csuma,scidac_csumb; |       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); |       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|       ScidacWriter _ScidacWriter(grid->IsBoss()); |       std::cout << GridLogMessage << "Written Binary RNG " << rng | ||||||
|       _ScidacWriter.open(config); |                 << " checksum " << std::hex  | ||||||
|       _ScidacWriter.writeScidacFieldRecord(U, MData); | 		<< nersc_csum   <<"/" | ||||||
|       _ScidacWriter.close(); | 		<< scidac_csuma   <<"/" | ||||||
|  | 		<< scidac_csumb  | ||||||
|  | 		<< std::dec << std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       { | ||||||
|  | 	ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||||
|  | 	_ScidacWriter.open(config); | ||||||
|  | 	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData); | ||||||
|  | 	_ScidacWriter.close(); | ||||||
|  |       } | ||||||
|  |        | ||||||
|  |       if ( Params.saveSmeared ) { | ||||||
|  | 	ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||||
|  | 	_ScidacWriter.open(smr); | ||||||
|  | 	_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData); | ||||||
|  | 	_ScidacWriter.close(); | ||||||
|  |       } | ||||||
|       std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl; |       std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl; | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|   | |||||||
| @@ -66,6 +66,7 @@ public: | |||||||
| template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy> | template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy> | ||||||
| class Integrator { | class Integrator { | ||||||
| protected: | protected: | ||||||
|  | public: | ||||||
|   typedef FieldImplementation_ FieldImplementation; |   typedef FieldImplementation_ FieldImplementation; | ||||||
|   typedef typename FieldImplementation::Field MomentaField;  //for readability |   typedef typename FieldImplementation::Field MomentaField;  //for readability | ||||||
|   typedef typename FieldImplementation::Field Field; |   typedef typename FieldImplementation::Field Field; | ||||||
| @@ -86,6 +87,8 @@ protected: | |||||||
|  |  | ||||||
|   const ActionSet<Field, RepresentationPolicy> as; |   const ActionSet<Field, RepresentationPolicy> as; | ||||||
|  |  | ||||||
|  |   ActionSet<Field,RepresentationPolicy> LevelForces; | ||||||
|  |    | ||||||
|   //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default |   //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default | ||||||
|   static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){  |   static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){  | ||||||
|     static MomentumFilterNone<MomentaField> filter; |     static MomentumFilterNone<MomentaField> filter; | ||||||
| @@ -96,7 +99,6 @@ protected: | |||||||
|   { |   { | ||||||
|     t_P[level] += ep; |     t_P[level] += ep; | ||||||
|     update_P(P, U, level, ep); |     update_P(P, U, level, ep); | ||||||
|  |  | ||||||
|     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; |     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -124,37 +126,33 @@ protected: | |||||||
|     // input U actually not used in the fundamental case |     // input U actually not used in the fundamental case | ||||||
|     // Fundamental updates, include smearing |     // Fundamental updates, include smearing | ||||||
|  |  | ||||||
|  |     assert(as.size()==LevelForces.size()); | ||||||
|  |      | ||||||
|  |     Field level_force(U.Grid()); level_force =Zero(); | ||||||
|     for (int a = 0; a < as[level].actions.size(); ++a) { |     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||||
|  |  | ||||||
|       double start_full = usecond(); |       double start_full = usecond(); | ||||||
|       Field force(U.Grid()); |       Field force(U.Grid()); | ||||||
|       conformable(U.Grid(), Mom.Grid()); |       conformable(U.Grid(), Mom.Grid()); | ||||||
|  |  | ||||||
|       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); |  | ||||||
|       double start_force = usecond(); |       double start_force = usecond(); | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl; |  | ||||||
|       AUDIT(); |  | ||||||
|        |  | ||||||
|       as[level].actions.at(a)->deriv_timer_start(); |       as[level].actions.at(a)->deriv_timer_start(); | ||||||
|       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta |       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||||
|       as[level].actions.at(a)->deriv_timer_stop(); |       as[level].actions.at(a)->deriv_timer_stop(); | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl; |  | ||||||
|       AUDIT(); |  | ||||||
|  |  | ||||||
|       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; |  | ||||||
|       auto name = as[level].actions.at(a)->action_name(); |       auto name = as[level].actions.at(a)->action_name(); | ||||||
|       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); |  | ||||||
|  |  | ||||||
|       force = FieldImplementation::projectForce(force); // Ta for gauge fields |       force = FieldImplementation::projectForce(force); // Ta for gauge fields | ||||||
|       double end_force = usecond(); |       double end_force = usecond(); | ||||||
|  |  | ||||||
|       //      DumpSliceNorm("force ",force,Nd-1); |  | ||||||
|       MomFilter->applyFilter(force); |  | ||||||
|       std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl; |  | ||||||
|       DumpSliceNorm("force filtered ",force,Nd-1); |  | ||||||
|        |        | ||||||
|  |       MomFilter->applyFilter(force); | ||||||
|  |  | ||||||
|  |       std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl; | ||||||
|  |  | ||||||
|  |       // track the total | ||||||
|  |       level_force = level_force+force; | ||||||
|  |  | ||||||
|       Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  |       Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  | ||||||
|       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     |       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||||
|  |  | ||||||
| @@ -177,6 +175,16 @@ protected: | |||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     { | ||||||
|  |       // total force | ||||||
|  |       Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  | ||||||
|  |       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||||
|  |  | ||||||
|  |       Real force_max   = std::sqrt(maxLocalNorm2(level_force)); | ||||||
|  |       Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;     | ||||||
|  |       LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // Force from the other representations |     // Force from the other representations | ||||||
|     as[level].apply(update_P_hireps, Representations, Mom, U, ep); |     as[level].apply(update_P_hireps, Representations, Mom, U, ep); | ||||||
|  |  | ||||||
| @@ -226,6 +234,16 @@ public: | |||||||
|  |  | ||||||
|     //Default the momentum filter to "do-nothing" |     //Default the momentum filter to "do-nothing" | ||||||
|     MomFilter = getDefaultMomFilter(); |     MomFilter = getDefaultMomFilter(); | ||||||
|  |  | ||||||
|  |     for (int level = 0; level < as.size(); ++level) { | ||||||
|  |       int multiplier = as.at(level).multiplier; | ||||||
|  |       ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier); | ||||||
|  |       Level->push_back(new EmptyAction<Field>);  | ||||||
|  |       LevelForces.push_back(*Level); | ||||||
|  |       // does it copy by value or reference?? | ||||||
|  |       // - answer it copies by value, BUT the action level contains a reference that is NOT updated. | ||||||
|  |       // Unsafe code in Guido's area | ||||||
|  |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual ~Integrator() {} |   virtual ~Integrator() {} | ||||||
| @@ -243,10 +261,14 @@ public: | |||||||
|  |  | ||||||
|   void reset_timer(void) |   void reset_timer(void) | ||||||
|   { |   { | ||||||
|  |     assert(as.size()==LevelForces.size()); | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|         as[level].actions.at(actionID)->reset_timer(); |         as[level].actions.at(actionID)->reset_timer(); | ||||||
|       } |       } | ||||||
|  |       int actionID=0; | ||||||
|  |       assert(LevelForces.at(level).actions.size()==1); | ||||||
|  |       LevelForces.at(level).actions.at(actionID)->reset_timer(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   void print_timer(void) |   void print_timer(void) | ||||||
| @@ -308,6 +330,16 @@ public: | |||||||
| 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num | 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num | ||||||
| 		  << std::endl; | 		  << std::endl; | ||||||
|       } |       } | ||||||
|  |       int actionID=0; | ||||||
|  |       std::cout << GridLogMessage  | ||||||
|  | 		  << LevelForces[level].actions.at(actionID)->action_name() | ||||||
|  | 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t " | ||||||
|  | 		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average() | ||||||
|  | 		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average() | ||||||
|  | 		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average() | ||||||
|  | 		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average() | ||||||
|  | 		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num | ||||||
|  | 		  << std::endl; | ||||||
|     } |     } | ||||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; |     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||||
|   } |   } | ||||||
| @@ -329,6 +361,13 @@ public: | |||||||
| 	std::cout << as[level].actions.at(actionID)->LogParameters(); | 	std::cout << as[level].actions.at(actionID)->LogParameters(); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl; | ||||||
|  |     for (int level = 0; level < LevelForces.size(); ++level) { | ||||||
|  |       std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl; | ||||||
|  |       for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) { | ||||||
|  | 	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; |     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -379,15 +418,10 @@ public: | |||||||
| 	auto name = as[level].actions.at(actionID)->action_name(); | 	auto name = as[level].actions.at(actionID)->action_name(); | ||||||
|         std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl; |         std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl; | ||||||
|  |  | ||||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); |  | ||||||
|  |  | ||||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl; |  | ||||||
| 	AUDIT(); |  | ||||||
| 	as[level].actions.at(actionID)->refresh_timer_start(); | 	as[level].actions.at(actionID)->refresh_timer_start(); | ||||||
|         as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); |         as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG); | ||||||
| 	as[level].actions.at(actionID)->refresh_timer_stop(); | 	as[level].actions.at(actionID)->refresh_timer_stop(); | ||||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl; |  | ||||||
| 	AUDIT(); |  | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       // Refresh the higher representation actions |       // Refresh the higher representation actions | ||||||
| @@ -415,6 +449,7 @@ public: | |||||||
|   RealD S(Field& U)  |   RealD S(Field& U)  | ||||||
|   {  // here also U not used |   {  // here also U not used | ||||||
|  |  | ||||||
|  |     assert(as.size()==LevelForces.size()); | ||||||
|     std::cout << GridLogIntegrator << "Integrator action\n"; |     std::cout << GridLogIntegrator << "Integrator action\n"; | ||||||
|  |  | ||||||
|     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom |     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||||
| @@ -424,17 +459,16 @@ public: | |||||||
|     // Actions |     // Actions | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
| 	AUDIT(); |  | ||||||
|         // get gauge field from the SmearingPolicy and |         // get gauge field from the SmearingPolicy and | ||||||
|         // based on the boolean is_smeared in actionID |         // based on the boolean is_smeared in actionID | ||||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); |  | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||||
| 	        as[level].actions.at(actionID)->S_timer_start(); | 	        as[level].actions.at(actionID)->S_timer_start(); | ||||||
|         Hterm = as[level].actions.at(actionID)->S(Us); |         Hterm = as[level].actions.at(actionID)->S(Smearer); | ||||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); |    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
| 	AUDIT(); |  | ||||||
|       } |       } | ||||||
|       as[level].apply(S_hireps, Representations, level, H); |       as[level].apply(S_hireps, Representations, level, H); | ||||||
|     } |     } | ||||||
| @@ -447,9 +481,9 @@ public: | |||||||
|     void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) { |     void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) { | ||||||
|        |        | ||||||
|       for (int a = 0; a < repr_set.size(); ++a) { |       for (int a = 0; a < repr_set.size(); ++a) { | ||||||
| 	AUDIT(); |  | ||||||
|         RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); |         RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); | ||||||
| 	AUDIT(); |  | ||||||
|         std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; |         std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
|  |  | ||||||
| @@ -471,13 +505,12 @@ public: | |||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|         // get gauge field from the SmearingPolicy and |         // get gauge field from the SmearingPolicy and | ||||||
|         // based on the boolean is_smeared in actionID |         // based on the boolean is_smeared in actionID | ||||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); |  | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||||
| 	        as[level].actions.at(actionID)->S_timer_start(); |  | ||||||
| 	AUDIT(); | 	as[level].actions.at(actionID)->S_timer_start(); | ||||||
|         Hterm = as[level].actions.at(actionID)->Sinitial(Us); |         Hterm = as[level].actions.at(actionID)->S(Smearer); | ||||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | 	as[level].actions.at(actionID)->S_timer_stop(); | ||||||
| 	AUDIT(); |  | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
|       } |       } | ||||||
| @@ -490,7 +523,6 @@ public: | |||||||
|    |    | ||||||
|   void integrate(Field& U)  |   void integrate(Field& U)  | ||||||
|   { |   { | ||||||
|     AUDIT(); |  | ||||||
|     // reset the clocks |     // reset the clocks | ||||||
|     t_U = 0; |     t_U = 0; | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
| @@ -508,10 +540,8 @@ public: | |||||||
|       assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same |       assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same | ||||||
|       std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; |       std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; | ||||||
|     } |     } | ||||||
|     AUDIT(); |  | ||||||
|  |  | ||||||
|     FieldImplementation::Project(U); |     FieldImplementation::Project(U); | ||||||
|     AUDIT(); |  | ||||||
|  |  | ||||||
|     // and that we indeed got to the end of the trajectory |     // and that we indeed got to the end of the trajectory | ||||||
|     assert(fabs(t_U - Params.trajL) < 1.0e-6); |     assert(fabs(t_U - Params.trajL) < 1.0e-6); | ||||||
|   | |||||||
| @@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid); | |||||||
| template <class Field> | template <class Field> | ||||||
| class HmcObservable { | class HmcObservable { | ||||||
|  public: |  public: | ||||||
|  |   virtual void TrajectoryComplete(int traj, | ||||||
|  |                                   ConfigurationBase<Field> &SmartConfig, | ||||||
|  |                                   GridSerialRNG &sRNG, | ||||||
|  |                                   GridParallelRNG &pRNG) | ||||||
|  |   { | ||||||
|  |     TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable | ||||||
|  |   }; | ||||||
|   virtual void TrajectoryComplete(int traj, |   virtual void TrajectoryComplete(int traj, | ||||||
|                                   Field &U, |                                   Field &U, | ||||||
|                                   GridSerialRNG &sRNG, |                                   GridSerialRNG &sRNG, | ||||||
|   | |||||||
| @@ -42,6 +42,18 @@ public: | |||||||
|   // necessary for HmcObservable compatibility |   // necessary for HmcObservable compatibility | ||||||
|   typedef typename Impl::Field Field; |   typedef typename Impl::Field Field; | ||||||
|  |  | ||||||
|  |   virtual void TrajectoryComplete(int traj, | ||||||
|  |                                   ConfigurationBase<Field> &SmartConfig, | ||||||
|  |                                   GridSerialRNG &sRNG, | ||||||
|  |                                   GridParallelRNG &pRNG) | ||||||
|  |   { | ||||||
|  |     std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl; | ||||||
|  |     std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl; | ||||||
|  |     TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable | ||||||
|  |     std::cout << GridLogMessage << "Smeared plaquette"<<std::endl; | ||||||
|  |     TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable | ||||||
|  |     std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl; | ||||||
|  |   }; | ||||||
|   void TrajectoryComplete(int traj, |   void TrajectoryComplete(int traj, | ||||||
|                           Field &U, |                           Field &U, | ||||||
|                           GridSerialRNG &sRNG, |                           GridSerialRNG &sRNG, | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  * Empty since HMC updates already the fundamental representation  |  * Empty since HMC updates already the fundamental representation  | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| template <int ncolour> | template <int ncolour, class group_name> | ||||||
| class FundamentalRep { | class FundamentalRep { | ||||||
| public: | public: | ||||||
|   static const int Dimension = ncolour; |   static const int Dimension = ncolour; | ||||||
| @@ -21,7 +21,7 @@ public: | |||||||
|  |  | ||||||
|   // typdef to be used by the Representations class in HMC to get the |   // typdef to be used by the Representations class in HMC to get the | ||||||
|   // types for the higher representation fields |   // types for the higher representation fields | ||||||
|   typedef typename SU<ncolour>::LatticeMatrix LatticeMatrix; |   typedef typename GaugeGroup<ncolour,group_name>::LatticeMatrix LatticeMatrix; | ||||||
|   typedef LatticeGaugeField LatticeField; |   typedef LatticeGaugeField LatticeField; | ||||||
|    |    | ||||||
|   explicit FundamentalRep(GridBase* grid) {} //do nothing |   explicit FundamentalRep(GridBase* grid) {} //do nothing | ||||||
| @@ -45,7 +45,8 @@ public: | |||||||
|      |      | ||||||
|  |  | ||||||
|    |    | ||||||
| typedef	 FundamentalRep<Nc> FundamentalRepresentation; | typedef	 FundamentalRep<Nc,GroupName::SU> FundamentalRepresentation; | ||||||
|  | typedef	 FundamentalRep<Nc,GroupName::Sp> SpFundamentalRepresentation; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid);   | NAMESPACE_END(Grid);   | ||||||
|  |  | ||||||
|   | |||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user