Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: feature/bl ... debug-crus (78 commits)
| SHA1 |
|---|
| bbec7f9fa9 |
| 3aa43e6065 |
| 78ac4044ff |
| 119c3db47f |
| 21bbdb8fc2 |
| 739bd7572c |
| 074627a5bd |
| 6a23b2c599 |
| bd891fb3f5 |
| 3984265851 |
| 45361d188f |
| 80c9d77e02 |
| 3aff64dddb |
| b4f2ca81ff |
| d1dea5f840 |
| 54f8b84d16 |
| da503fef0e |
| 4a6802098a |
| f9b41a84d2 |
| 5d7e0d18b9 |
| 9e64387933 |
| 983b681d46 |
| 4072408b6f |
| bd76b47fbf |
| 18ce23aa75 |
| ffa7fe0cc2 |
| 6b979f0a69 |
| 86dac5ff4f |
| 4a382fad3f |
| cc753670d9 |
| cc9d88ea1c |
| b281b0166e |
| 6a21f694ff |
| fc4db5e963 |
| 6252ffaf76 |
| af64c1c6b6 |
| 866f48391a |
| a4df527d74 |
| 5764d21161 |
| 496d04cd85 |
| 10e6d7c6ce |
| c42e25e5b8 |
| a00ae981e0 |
| 58e020b62a |
| a7e1aceeca |
| 7212432f43 |
| 4a261fab30 |
| 6af97069b9 |
| 5068413cdb |
| 71c6960eea |
| ddf6d5c9e3 |
| 39214702f6 |
| 3e4614c63a |
| 900e01f49b |
| 2376156fbc |
| 3f2fd49db4 |
| 0efa107cb6 |
| 8feedb4f6f |
| 05e562e3d7 |
| dd3bbb8fa2 |
| 2fbcf13c46 |
| 4ea48ef0c4 |
| 5c85774ee3 |
| d8a9a745d8 |
| dcf172da3b |
| 546be724e7 |
| 481bbaf1fc |
| 281488611a |
| bae0f8ea99 |
| bbbcd36ae5 |
| a3e935c902 |
| 7731c7db8e |
| ff97340324 |
| 920a51438d |
| be528b6d27 |
| 7d62f1d6d2 |
| 458c943987 |
| 88015b0858 |
.github/ISSUE_TEMPLATE/bug-report.yml (vendored, new file, 54 lines)
							| @@ -0,0 +1,54 @@ | |||||||
|  | name: Bug report | ||||||
|  | description: Report a bug. | ||||||
|  | title: "<insert title>" | ||||||
|  | labels: [bug] | ||||||
|  |  | ||||||
|  | body: | ||||||
|  |   - type: markdown | ||||||
|  |     attributes: | ||||||
|  |       value: > | ||||||
|  |         Thank you for taking the time to file a bug report. | ||||||
|  |         Please check that the code is pointing to the HEAD of develop | ||||||
|  |         or any commit in master which is tagged with a version number. | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Describe the issue:" | ||||||
|  |       description: > | ||||||
|  |         Describe the issue and any previous attempt to solve it. | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Code example:" | ||||||
|  |       description: > | ||||||
|  |         If relevant, show how to reproduce the issue using a minimal working | ||||||
|  |         example. | ||||||
|  |       placeholder: | | ||||||
|  |         << your code here >> | ||||||
|  |       render: shell | ||||||
|  |     validations: | ||||||
|  |       required: false | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Target platform:" | ||||||
|  |       description: > | ||||||
|  |         Give a description of the target platform (CPU, network, compiler). | ||||||
|  |         Please give the full CPU part description, using for example | ||||||
|  |         `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) | ||||||
|  |         or `sysctl machdep.cpu.brand_string` (macOS) and the full output | ||||||
|  |         of the `--version` option of your compiler. | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
|  |  | ||||||
|  |   - type: textarea | ||||||
|  |     attributes: | ||||||
|  |       label: "Configure options:" | ||||||
|  |       description: > | ||||||
|  |         Please give the exact configure command used and attach | ||||||
|  |         `config.log`, `grid.config.summary` and the output of `make V=1`. | ||||||
|  |       render: shell | ||||||
|  |     validations: | ||||||
|  |       required: true | ||||||
| @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); | |||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | ||||||
|  | #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h> | ||||||
| #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | ||||||
| #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | ||||||
|   | |||||||
| @@ -542,6 +542,7 @@ public: | |||||||
|       (*this)(in[i], out[i]); |       (*this)(in[i], out[i]); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   virtual ~LinearFunction(){}; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { | template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { | ||||||
|   | |||||||
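The one-line addition above gives LinearFunction a virtual destructor, which matters because solver wrappers are routinely owned and destroyed through `LinearFunction<Field>*` base pointers; without it, such a delete is undefined behaviour and derived-class cleanup never runs. A minimal standalone illustration (hypothetical names, not Grid code):

```cpp
#include <iostream>
#include <memory>

struct LinearFunctionBase {         // stand-in for LinearFunction<Field>
  virtual ~LinearFunctionBase(){};  // the destructor this diff adds
};

struct Solver : LinearFunctionBase {
  ~Solver() { std::cout << "releasing solver workspace\n"; }
};

int main() {
  // With the virtual base destructor, deleting through the base pointer
  // correctly runs ~Solver(); without it this would be undefined behaviour.
  std::unique_ptr<LinearFunctionBase> f = std::make_unique<Solver>();
  return 0;
}
```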
| @@ -191,7 +191,7 @@ public: | |||||||
| 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||||
| 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||||
|  |  | ||||||
| 	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||||
|  |  | ||||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); |         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||||
|  |  | ||||||
|   | |||||||
Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h (new file, 213 lines)
							| @@ -0,0 +1,213 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  |     Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||||
|  | #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | //Mixed precision restarted defect correction CG | ||||||
|  | template<class FieldD,class FieldF,  | ||||||
|  |   typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, | ||||||
|  |   typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||||
|  | class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> { | ||||||
|  | public: | ||||||
|  |   using LinearFunction<FieldD>::operator(); | ||||||
|  |   RealD   Tolerance; | ||||||
|  |   RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||||
|  |   Integer MaxInnerIterations; | ||||||
|  |   Integer MaxOuterIterations; | ||||||
|  |   Integer MaxPatchupIterations; | ||||||
|  |   GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||||
|  |   RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||||
|  |   LinearOperatorBase<FieldF> &Linop_f; | ||||||
|  |   LinearOperatorBase<FieldD> &Linop_d; | ||||||
|  |  | ||||||
|  |   //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||||
|  |   LinearFunction<FieldF> *guesser; | ||||||
|  |   bool updateResidual; | ||||||
|  |    | ||||||
|  |   MixedPrecisionConjugateGradientBatched(RealD tol,  | ||||||
|  |           Integer maxinnerit,  | ||||||
|  |           Integer maxouterit,  | ||||||
|  |           Integer maxpatchit, | ||||||
|  |           GridBase* _sp_grid,  | ||||||
|  |           LinearOperatorBase<FieldF> &_Linop_f,  | ||||||
|  |           LinearOperatorBase<FieldD> &_Linop_d, | ||||||
|  |           bool _updateResidual=true) : | ||||||
|  |     Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||||
|  |     Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), | ||||||
|  |     OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; | ||||||
|  |  | ||||||
|  |   void useGuesser(LinearFunction<FieldF> &g){ | ||||||
|  |     guesser = &g; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||||
|  |     std::vector<FieldD> srcs_d_in{src_d_in}; | ||||||
|  |     std::vector<FieldD> sols_d{sol_d}; | ||||||
|  |  | ||||||
|  |     (*this)(srcs_d_in,sols_d); | ||||||
|  |  | ||||||
|  |     sol_d = sols_d[0]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ | ||||||
|  |     assert(src_d_in.size() == sol_d.size()); | ||||||
|  |     int NBatch = src_d_in.size(); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; | ||||||
|  |  | ||||||
|  |     Integer TotalOuterIterations = 0; //Number of restarts | ||||||
|  |     std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations | ||||||
|  |     std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step | ||||||
|  |    | ||||||
|  |     GridStopWatch TotalTimer; | ||||||
|  |     TotalTimer.Start(); | ||||||
|  |  | ||||||
|  |     GridStopWatch InnerCGtimer; | ||||||
|  |     GridStopWatch PrecChangeTimer; | ||||||
|  |      | ||||||
|  |     int cb = src_d_in[0].Checkerboard(); | ||||||
|  |      | ||||||
|  |     std::vector<RealD> src_norm; | ||||||
|  |     std::vector<RealD> norm; | ||||||
|  |     std::vector<RealD> stop; | ||||||
|  |      | ||||||
|  |     GridBase* DoublePrecGrid = src_d_in[0].Grid(); | ||||||
|  |     FieldD tmp_d(DoublePrecGrid); | ||||||
|  |     tmp_d.Checkerboard() = cb; | ||||||
|  |      | ||||||
|  |     FieldD tmp2_d(DoublePrecGrid); | ||||||
|  |     tmp2_d.Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |     std::vector<FieldD> src_d; | ||||||
|  |     std::vector<FieldF> src_f; | ||||||
|  |     std::vector<FieldF> sol_f; | ||||||
|  |  | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       sol_d[i].Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |       src_norm.push_back(norm2(src_d_in[i])); | ||||||
|  |       norm.push_back(0.); | ||||||
|  |       stop.push_back(src_norm[i] * Tolerance*Tolerance); | ||||||
|  |  | ||||||
|  |       src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation | ||||||
|  |  | ||||||
|  |       src_f.push_back(SinglePrecGrid); | ||||||
|  |       src_f[i].Checkerboard() = cb; | ||||||
|  |  | ||||||
|  |       sol_f.push_back(SinglePrecGrid); | ||||||
|  |       sol_f[i].Checkerboard() = cb; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     RealD inner_tol = InnerTolerance; | ||||||
|  |      | ||||||
|  |     ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||||
|  |     CG_f.ErrorOnNoConverge = false; | ||||||
|  |      | ||||||
|  |     Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||||
|  |        | ||||||
|  |     for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||||
|  |       std::cout << GridLogMessage << std::endl; | ||||||
|  |       std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; | ||||||
|  |        | ||||||
|  |       bool allConverged = true; | ||||||
|  |        | ||||||
|  |       for (int i=0; i<NBatch; i++) { | ||||||
|  |         //Compute double precision rsd and also new RHS vector. | ||||||
|  |         Linop_d.HermOp(sol_d[i], tmp_d); | ||||||
|  |         norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector | ||||||
|  |          | ||||||
|  |         std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl; | ||||||
|  |  | ||||||
|  |         PrecChangeTimer.Start(); | ||||||
|  |         precisionChange(src_f[i], src_d[i]); | ||||||
|  |         PrecChangeTimer.Stop(); | ||||||
|  |          | ||||||
|  |         sol_f[i] = Zero(); | ||||||
|  |        | ||||||
|  |         if(norm[i] > OuterLoopNormMult * stop[i]) { | ||||||
|  |           allConverged = false; | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |       if (allConverged) break; | ||||||
|  |  | ||||||
|  |       if (updateResidual) { | ||||||
|  |         RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); | ||||||
|  |         RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); | ||||||
|  |         while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||||
|  |         CG_f.Tolerance = inner_tol; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       //Optionally improve inner solver guess (eg using known eigenvectors) | ||||||
|  |       if(guesser != NULL) { | ||||||
|  |         (*guesser)(src_f, sol_f); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       for (int i=0; i<NBatch; i++) { | ||||||
|  |         //Inner CG | ||||||
|  |         InnerCGtimer.Start(); | ||||||
|  |         CG_f(Linop_f, src_f[i], sol_f[i]); | ||||||
|  |         InnerCGtimer.Stop(); | ||||||
|  |         TotalInnerIterations[i] += CG_f.IterationsToComplete; | ||||||
|  |          | ||||||
|  |         //Convert sol back to double and add to double prec solution | ||||||
|  |         PrecChangeTimer.Start(); | ||||||
|  |         precisionChange(tmp_d, sol_f[i]); | ||||||
|  |         PrecChangeTimer.Stop(); | ||||||
|  |          | ||||||
|  |         axpy(sol_d[i], 1.0, tmp_d, sol_d[i]); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     //Final trial CG | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl; | ||||||
|  |      | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations); | ||||||
|  |       CG_d(Linop_d, src_d_in[i], sol_d[i]); | ||||||
|  |       TotalFinalStepIterations[i] += CG_d.IterationsToComplete; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     TotalTimer.Stop(); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     for (int i=0; i<NBatch; i++) { | ||||||
|  |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl; | ||||||
|  |     } | ||||||
|  |     std::cout << GridLogMessage << std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||||
|  |      | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  | #endif | ||||||
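A hedged usage sketch of the new batched solver follows. Only the constructor arguments and the vector `operator()` are taken from the file above; the grids, operators, fermion-field typedefs, and NBatch are placeholders for whatever the caller already has set up:

```cpp
// Sketch, assuming LatticeFermionD/LatticeFermionF fields, a single-precision
// grid SinglePrecGrid, and matching operators Linop_f/Linop_d already exist.
MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
  mpcg(1.0e-8,          // Tolerance (outer, double precision)
       10000,           // MaxInnerIterations (single-precision CG)
       10,              // MaxOuterIterations (restarts)
       1000,            // MaxPatchupIterations (final double-precision CG)
       SinglePrecGrid, Linop_f, Linop_d);

std::vector<LatticeFermionD> srcs(NBatch, LatticeFermionD(DoublePrecGrid));
std::vector<LatticeFermionD> sols(NBatch, LatticeFermionD(DoublePrecGrid));
for (auto &s : sols) s = Zero();  // zero initial guesses
mpcg(srcs, sols);                 // one restarted defect-correction solve per RHS
```

Batching lets the optional guesser see all NBatch single-precision residual sources at once, which is the point of the vector interface.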
| @@ -166,16 +166,16 @@ public: | |||||||
|       rsqf[s] =rsq[s]; |       rsqf[s] =rsq[s]; | ||||||
|       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl; |       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl; | ||||||
|       //      ps_d[s] = src_d; |       //      ps_d[s] = src_d; | ||||||
|       precisionChangeFast(ps_f[s],src_d); |       precisionChange(ps_f[s],src_d); | ||||||
|     } |     } | ||||||
|     // r and p for primary |     // r and p for primary | ||||||
|     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys |     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys | ||||||
|     r_d = p_d; |     r_d = p_d; | ||||||
|      |      | ||||||
|     //MdagM+m[0] |     //MdagM+m[0] | ||||||
|     precisionChangeFast(p_f,p_d); |     precisionChange(p_f,p_d); | ||||||
|     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     precisionChangeFast(tmp_d,mmp_f); |     precisionChange(tmp_d,mmp_f); | ||||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     tmp_d = tmp_d - mmp_d; |     tmp_d = tmp_d - mmp_d; | ||||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; |     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||||
| @@ -204,7 +204,7 @@ public: | |||||||
|    |    | ||||||
|     for(int s=0;s<nshift;s++) { |     for(int s=0;s<nshift;s++) { | ||||||
|       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); |       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); | ||||||
|       precisionChangeFast(psi_f[s],psi_d[s]); |       precisionChange(psi_f[s],psi_d[s]); | ||||||
|     } |     } | ||||||
|    |    | ||||||
|     /////////////////////////////////////// |     /////////////////////////////////////// | ||||||
| @@ -225,7 +225,7 @@ public: | |||||||
|       AXPYTimer.Stop(); |       AXPYTimer.Stop(); | ||||||
|  |  | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(r_f, r_d); |       precisionChange(r_f, r_d); | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|  |  | ||||||
|       AXPYTimer.Start(); |       AXPYTimer.Start(); | ||||||
| @@ -243,13 +243,13 @@ public: | |||||||
|  |  | ||||||
|       cp=c; |       cp=c; | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(p_f, p_d); //get back single prec search direction for linop |       precisionChange(p_f, p_d); //get back single prec search direction for linop | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|       MatrixTimer.Start();   |       MatrixTimer.Start();   | ||||||
|       Linop_f.HermOp(p_f,mmp_f); |       Linop_f.HermOp(p_f,mmp_f); | ||||||
|       MatrixTimer.Stop();   |       MatrixTimer.Stop();   | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChangeFast(mmp_d, mmp_f); // From Float to Double |       precisionChange(mmp_d, mmp_f); // From Float to Double | ||||||
|       PrecChangeTimer.Stop(); |       PrecChangeTimer.Stop(); | ||||||
|  |  | ||||||
|       d=real(innerProduct(p_d,mmp_d));     |       d=real(innerProduct(p_d,mmp_d));     | ||||||
| @@ -311,7 +311,7 @@ public: | |||||||
| 	SolverTimer.Stop(); | 	SolverTimer.Stop(); | ||||||
|  |  | ||||||
| 	for(int s=0;s<nshift;s++){ | 	for(int s=0;s<nshift;s++){ | ||||||
| 	  precisionChangeFast(psi_d[s],psi_f[s]); | 	  precisionChange(psi_d[s],psi_f[s]); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	 | 	 | ||||||
|   | |||||||
| @@ -211,7 +211,7 @@ public: | |||||||
|     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) |     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp) | ||||||
|     tmp_d = tmp_d - mmp_d; |     tmp_d = tmp_d - mmp_d; | ||||||
|     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; |     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; | ||||||
|     //    assert(norm2(tmp_d)< 1.0e-4); |     assert(norm2(tmp_d)< 1.0); | ||||||
|  |  | ||||||
|     axpy(mmp_d,mass[0],p_d,mmp_d); |     axpy(mmp_d,mass[0],p_d,mmp_d); | ||||||
|     RealD rn = norm2(p_d); |     RealD rn = norm2(p_d); | ||||||
|   | |||||||
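The hunks above uniformly replace precisionChangeFast with precisionChange. The call shape, read straight off the diff, converts a field between double- and single-precision grids; a minimal sketch of the pattern (field and grid names are placeholders):

```cpp
// Sketch: the precisionChange(out, in) signature as used in the hunks above.
LatticeFermionF p_f(SinglePrecGrid);  // single-precision workspace
LatticeFermionD p_d(DoublePrecGrid);  // double-precision master copy
precisionChange(p_f, p_d);            // down-convert before Linop_f.HermOp
precisionChange(p_d, p_f);            // up-convert the result back to double
```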
| @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
| /*Allocation types, saying which pointer cache should be used*/ | /*Allocation types, saying which pointer cache should be used*/ | ||||||
| #define Cpu      (0) | #define Cpu      (0) | ||||||
| #define CpuSmall (1) | #define CpuHuge  (1) | ||||||
| #define Acc      (2) | #define CpuSmall (2) | ||||||
| #define AccSmall (3) | #define Acc      (3) | ||||||
| #define Shared   (4) | #define AccHuge  (4) | ||||||
| #define SharedSmall (5) | #define AccSmall (5) | ||||||
|  | #define Shared   (6) | ||||||
|  | #define SharedHuge  (7) | ||||||
|  | #define SharedSmall (8) | ||||||
| #undef GRID_MM_VERBOSE  | #undef GRID_MM_VERBOSE  | ||||||
| uint64_t total_shared; | uint64_t total_shared; | ||||||
| uint64_t total_device; | uint64_t total_device; | ||||||
| @@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void) | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
|  | uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||||
|  | uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| // Data tables for recently freed pointer caches | // Data tables for recently freed pointer caches | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | ||||||
| int MemoryManager::Victim[MemoryManager::NallocType]; | int MemoryManager::Victim[MemoryManager::NallocType]; | ||||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; | int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; | ||||||
| uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | ||||||
| ////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////// | ||||||
| // Actual allocation and deallocation utils | // Actual allocation and deallocation utils | ||||||
| @@ -170,6 +176,16 @@ void MemoryManager::Init(void) | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   str= getenv("GRID_ALLOC_NCACHE_HUGE"); | ||||||
|  |   if ( str ) { | ||||||
|  |     Nc = atoi(str); | ||||||
|  |     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||||
|  |       Ncache[CpuHuge]=Nc; | ||||||
|  |       Ncache[AccHuge]=Nc; | ||||||
|  |       Ncache[SharedHuge]=Nc; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   str= getenv("GRID_ALLOC_NCACHE_SMALL"); |   str= getenv("GRID_ALLOC_NCACHE_SMALL"); | ||||||
|   if ( str ) { |   if ( str ) { | ||||||
|     Nc = atoi(str); |     Nc = atoi(str); | ||||||
| @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { | |||||||
|    |    | ||||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; |   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl; |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl; | ||||||
| #endif | #endif | ||||||
|    |    | ||||||
| #ifdef GRID_UVM | #ifdef GRID_UVM | ||||||
| @@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) { | |||||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | ||||||
| { | { | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); |   int cache; | ||||||
|   int cache = type + small; |   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||||
|  |   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||||
|  |   else                                     cache = type; | ||||||
|  |  | ||||||
|   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   |   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   | ||||||
| #else | #else | ||||||
|   return ptr; |   return ptr; | ||||||
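The new three-way classification replaces the old boolean small test, with +2 selecting the Small pool and +1 the Huge pool relative to each base type (after this change: Cpu=0, Acc=3, Shared=6). A standalone restatement of the selection logic, checkable by running it (the function name is illustrative; the limits are the ones defined in this diff):

```cpp
#include <cassert>
#include <cstddef>

static const size_t GRID_ALLOC_SMALL_LIMIT = 4096;        // from the diff
static const size_t GRID_ALLOC_HUGE_LIMIT  = 2147483648;  // 2 GiB, from the diff

// Illustrative restatement of the Insert/Lookup pool selection.
int cacheIndex(int type, size_t bytes) {
  if (bytes <  GRID_ALLOC_SMALL_LIMIT) return type + 2;   // e.g. Cpu -> CpuSmall
  if (bytes >= GRID_ALLOC_HUGE_LIMIT)  return type + 1;   // e.g. Cpu -> CpuHuge
  return type;                                            // default large pool
}

int main() {
  assert(cacheIndex(0, 1024)      == 2); // Cpu,    1 KiB -> CpuSmall
  assert(cacheIndex(0, 1UL << 20) == 0); // Cpu,    1 MiB -> Cpu
  assert(cacheIndex(3, 3UL << 30) == 4); // Acc,    3 GiB -> AccHuge
  assert(cacheIndex(6, 1UL << 12) == 6); // Shared, 4 KiB -> Shared (not small)
  return 0;
}
```

Note the default Ncache entries for the Huge pools are 0, so huge allocations bypass caching unless GRID_ALLOC_NCACHE_HUGE is set; the new `if (ncache == 0) return ptr;` early-out in Insert makes that safe.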
| @@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) | |||||||
|  |  | ||||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | ||||||
| { | { | ||||||
|   assert(ncache>0); |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
|  |   if (ncache == 0) return ptr; | ||||||
|  |  | ||||||
|   void * ret = NULL; |   void * ret = NULL; | ||||||
|   int v = -1; |   int v = -1; | ||||||
|  |  | ||||||
| @@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | |||||||
| void *MemoryManager::Lookup(size_t bytes,int type) | void *MemoryManager::Lookup(size_t bytes,int type) | ||||||
| { | { | ||||||
| #ifdef ALLOCATION_CACHE | #ifdef ALLOCATION_CACHE | ||||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); |   int cache; | ||||||
|   int cache = type+small; |   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||||
|  |   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||||
|  |   else                                     cache = type; | ||||||
|  |  | ||||||
|   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); |   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); | ||||||
| #else | #else | ||||||
|   return NULL; |   return NULL; | ||||||
| @@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) | |||||||
|  |  | ||||||
| void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | ||||||
| { | { | ||||||
|   assert(ncache>0); |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
| #endif  | #endif  | ||||||
|   | |||||||
| @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // Move control to configure.ac and Config.h? | // Move control to configure.ac and Config.h? | ||||||
|  |  | ||||||
| #define GRID_ALLOC_SMALL_LIMIT (4096) | #define GRID_ALLOC_SMALL_LIMIT (4096) | ||||||
|  | #define GRID_ALLOC_HUGE_LIMIT  (2147483648) | ||||||
|  |  | ||||||
| #define STRINGIFY(x) #x | #define STRINGIFY(x) #x | ||||||
| #define TOSTRING(x) STRINGIFY(x) | #define TOSTRING(x) STRINGIFY(x) | ||||||
| @@ -70,6 +71,21 @@ enum ViewMode { | |||||||
|   CpuWriteDiscard = 0x10 // same for now |   CpuWriteDiscard = 0x10 // same for now | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | struct MemoryStatus { | ||||||
|  |   uint64_t     DeviceBytes; | ||||||
|  |   uint64_t     DeviceLRUBytes; | ||||||
|  |   uint64_t     DeviceMaxBytes; | ||||||
|  |   uint64_t     HostToDeviceBytes; | ||||||
|  |   uint64_t     DeviceToHostBytes; | ||||||
|  |   uint64_t     HostToDeviceXfer; | ||||||
|  |   uint64_t     DeviceToHostXfer; | ||||||
|  |   uint64_t     DeviceEvictions; | ||||||
|  |   uint64_t     DeviceDestroy; | ||||||
|  |   uint64_t     DeviceAllocCacheBytes; | ||||||
|  |   uint64_t     HostAllocCacheBytes; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| class MemoryManager { | class MemoryManager { | ||||||
| private: | private: | ||||||
|  |  | ||||||
| @@ -83,7 +99,7 @@ private: | |||||||
|   } AllocationCacheEntry; |   } AllocationCacheEntry; | ||||||
|  |  | ||||||
|   static const int NallocCacheMax=128;  |   static const int NallocCacheMax=128;  | ||||||
|   static const int NallocType=6; |   static const int NallocType=9; | ||||||
|   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; |   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; | ||||||
|   static int Victim[NallocType]; |   static int Victim[NallocType]; | ||||||
|   static int Ncache[NallocType]; |   static int Ncache[NallocType]; | ||||||
| @@ -122,6 +138,25 @@ private: | |||||||
|   static uint64_t     DeviceEvictions; |   static uint64_t     DeviceEvictions; | ||||||
|   static uint64_t     DeviceDestroy; |   static uint64_t     DeviceDestroy; | ||||||
|    |    | ||||||
|  |   static uint64_t     DeviceCacheBytes(); | ||||||
|  |   static uint64_t     HostCacheBytes(); | ||||||
|  |  | ||||||
|  |   static MemoryStatus GetFootprint(void) { | ||||||
|  |     MemoryStatus stat; | ||||||
|  |     stat.DeviceBytes       = DeviceBytes; | ||||||
|  |     stat.DeviceLRUBytes    = DeviceLRUBytes; | ||||||
|  |     stat.DeviceMaxBytes    = DeviceMaxBytes; | ||||||
|  |     stat.HostToDeviceBytes = HostToDeviceBytes; | ||||||
|  |     stat.DeviceToHostBytes = DeviceToHostBytes; | ||||||
|  |     stat.HostToDeviceXfer  = HostToDeviceXfer; | ||||||
|  |     stat.DeviceToHostXfer  = DeviceToHostXfer; | ||||||
|  |     stat.DeviceEvictions   = DeviceEvictions; | ||||||
|  |     stat.DeviceDestroy     = DeviceDestroy; | ||||||
|  |     stat.DeviceAllocCacheBytes = DeviceCacheBytes(); | ||||||
|  |     stat.HostAllocCacheBytes   = HostCacheBytes(); | ||||||
|  |     return stat; | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  private: |  private: | ||||||
| #ifndef GRID_UVM | #ifndef GRID_UVM | ||||||
|   ////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
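A hypothetical caller-side sketch of the new footprint query; it assumes GetFootprint() is reachable from user code in an initialised Grid program, and uses only the field names declared above:

```cpp
// Sketch: querying and printing the memory-manager footprint.
Grid::MemoryStatus s = Grid::MemoryManager::GetFootprint();
std::cout << "device resident    " << s.DeviceBytes          << " bytes\n"
          << "device alloc cache " << s.DeviceAllocCacheBytes << " bytes\n"
          << "host alloc cache   " << s.HostAllocCacheBytes   << " bytes\n"
          << "device evictions   " << s.DeviceEvictions       << "\n";
```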
| @@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s) | |||||||
|   uint64_t LruBytes1=0; |   uint64_t LruBytes1=0; | ||||||
|   uint64_t LruBytes2=0; |   uint64_t LruBytes2=0; | ||||||
|   uint64_t LruCnt=0; |   uint64_t LruCnt=0; | ||||||
|   uint64_t LockedBytes=0; |  | ||||||
|    |    | ||||||
|   std::cout << " Memory Manager::Audit() from "<<s<<std::endl; |   std::cout << " Memory Manager::Audit() from "<<s<<std::endl; | ||||||
|   for(auto it=LRU.begin();it!=LRU.end();it++){ |   for(auto it=LRU.begin();it!=LRU.end();it++){ | ||||||
|   | |||||||
| @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||||
| { | { | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
|   StencilBarrier();// Synch shared memory on a single nodes |  | ||||||
|  |  | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   if (nreq==0) return; | ||||||
|   | |||||||
| @@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
| 							 int recv_from_rank,int dor, | 							 int recv_from_rank,int dor, | ||||||
| 							 int xbytes,int rbytes, int dir) | 							 int xbytes,int rbytes, int dir) | ||||||
| { | { | ||||||
|   return 2.0*bytes; |   return xbytes+rbytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void) | |||||||
|   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; |   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl; | ||||||
|   return ShmCommBufs[ShmRank]; |   return ShmCommBufs[ShmRank]; | ||||||
| } | } | ||||||
|  | static inline int divides(int a,int b) | ||||||
|  | { | ||||||
|  |   return ( b == ( (b/a)*a ) ); | ||||||
|  | } | ||||||
|  | void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) | ||||||
|  | { | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Allow user to configure through environment variable | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); | ||||||
|  |   if ( str ) { | ||||||
|  |     std::vector<int> IntShmDims; | ||||||
|  |     GridCmdOptionIntVector(std::string(str),IntShmDims); | ||||||
|  |     assert(IntShmDims.size() == WorldDims.size()); | ||||||
|  |     long ShmSize = 1; | ||||||
|  |     for (int dim=0;dim<WorldDims.size();dim++) { | ||||||
|  |       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); | ||||||
|  |       assert(divides(ShmDims[dim],WorldDims[dim])); | ||||||
|  |     } | ||||||
|  |     assert(ShmSize == WorldShmSize); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Powers of 2,3,5 only in prime decomposition for now | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int ndimension = WorldDims.size(); | ||||||
|  |   ShmDims=Coordinate(ndimension,1); | ||||||
|  |  | ||||||
|  |   std::vector<int> primes({2,3,5}); | ||||||
|  |  | ||||||
|  |   int dim = 0; | ||||||
|  |   int last_dim = ndimension - 1; | ||||||
|  |   int AutoShmSize = 1; | ||||||
|  |   while(AutoShmSize != WorldShmSize) { | ||||||
|  |     int p; | ||||||
|  |     for(p=0;p<primes.size();p++) { | ||||||
|  |       int prime=primes[p]; | ||||||
|  |       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) | ||||||
|  |         && divides(prime,WorldShmSize/AutoShmSize)  ) { | ||||||
|  |         AutoShmSize*=prime; | ||||||
|  |         ShmDims[dim]*=prime; | ||||||
|  |         last_dim = dim; | ||||||
|  |   break; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (p == primes.size() && last_dim == dim) { | ||||||
|  |       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |     dim=(dim+1) %ndimension; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid);  | NAMESPACE_END(Grid);  | ||||||
|  |  | ||||||
|   | |||||||
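GetShmDims, moved here from SharedMemoryMpi.cc, splits the intranode rank count across the lattice dimensions using prime factors 2, 3 and 5 only. A self-contained restatement of the loop with one worked case, so the decomposition can be checked by running it (not Grid code; it mirrors the logic above):

```cpp
#include <cstdio>
#include <vector>

static bool divides(int a, int b) { return b % a == 0; }

int main() {
  // Worked case: 4 lattice dims {4,4,4,8}, 8 ranks per node to place.
  std::vector<int> world{4, 4, 4, 8}, shm(4, 1), primes{2, 3, 5};
  int shmSize = 8, autoSize = 1, dim = 0, last = 3;
  while (autoSize != shmSize) {
    size_t p = 0;
    for (; p < primes.size(); p++) {
      // A prime is usable if it divides both the remaining extent of this
      // dimension and the remaining node size, exactly as in GetShmDims.
      if (divides(primes[p], world[dim] / shm[dim]) &&
          divides(primes[p], shmSize / autoSize)) {
        autoSize *= primes[p]; shm[dim] *= primes[p]; last = dim;
        break;
      }
    }
    if (p == primes.size() && last == dim) return 1;  // no valid split exists
    dim = (dim + 1) % 4;
  }
  printf("ShmDims = %d.%d.%d.%d\n", shm[0], shm[1], shm[2], shm[3]);  // 2.2.2.1
}
```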
| @@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
|  | #define header "SharedMemoryMpi: " | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
| #include <pwd.h> | #include <pwd.h> | ||||||
| #include <syscall.h> |  | ||||||
|  |  | ||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
| #include <cuda_runtime_api.h> | #include <cuda_runtime_api.h> | ||||||
| @@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #ifdef GRID_HIP | #ifdef GRID_HIP | ||||||
| #include <hip/hip_runtime_api.h> | #include <hip/hip_runtime_api.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCl | #ifdef GRID_SYCL | ||||||
|  | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|  | #include <syscall.h> | ||||||
|  | #define SHM_SOCKETS  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #include <sys/socket.h> | ||||||
|  | #include <sys/un.h> | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid);  | NAMESPACE_BEGIN(Grid);  | ||||||
| #define header "SharedMemoryMpi: " |  | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Barbaric extra intranode communication route in case we need sockets to pass FDs | ||||||
|  |  * Forced by level_zero not being nicely designed | ||||||
|  |  */ | ||||||
|  | static int sock; | ||||||
|  | static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; | ||||||
|  | static char sock_path[256]; | ||||||
|  | class UnixSockets { | ||||||
|  | public: | ||||||
|  |   static void Open(int rank) | ||||||
|  |   { | ||||||
|  |     int errnum; | ||||||
|  |  | ||||||
|  |     sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0); | ||||||
|  |  | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); | ||||||
|  |     unlink(sa_un.sun_path); | ||||||
|  |     if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { | ||||||
|  |       perror("bind failure"); | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static int RecvFileDescriptor(void) | ||||||
|  |   { | ||||||
|  |     int n; | ||||||
|  |     int fd; | ||||||
|  |     char buf[1]; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct cmsghdr *cmsg; | ||||||
|  |     char cms[CMSG_SPACE(sizeof(int))]; | ||||||
|  |  | ||||||
|  |     iov.iov_base = buf; | ||||||
|  |     iov.iov_len = 1; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof msg); | ||||||
|  |     msg.msg_name = 0; | ||||||
|  |     msg.msg_namelen = 0; | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |  | ||||||
|  |     msg.msg_control = (caddr_t)cms; | ||||||
|  |     msg.msg_controllen = sizeof cms; | ||||||
|  |  | ||||||
|  |     if((n=recvmsg(sock, &msg, 0)) < 0) { | ||||||
|  |       perror("recvmsg failed"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     if(n == 0){ | ||||||
|  |       perror("recvmsg returned 0"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |  | ||||||
|  |     memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); | ||||||
|  |  | ||||||
|  |     return fd; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void SendFileDescriptor(int fildes,int xmit_to_rank) | ||||||
|  |   { | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct cmsghdr *cmsg = NULL; | ||||||
|  |     char ctrl[CMSG_SPACE(sizeof(int))]; | ||||||
|  |     char data = ' '; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof(struct msghdr)); | ||||||
|  |     memset(ctrl, 0, CMSG_SPACE(sizeof(int))); | ||||||
|  |     iov.iov_base = &data; | ||||||
|  |     iov.iov_len = sizeof(data); | ||||||
|  |      | ||||||
|  |     sprintf(sock_path,sock_path_fmt,xmit_to_rank); | ||||||
|  |      | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); | ||||||
|  |  | ||||||
|  |     msg.msg_name = (void *)&sa_un; | ||||||
|  |     msg.msg_namelen = sizeof(sa_un); | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |     msg.msg_controllen =  CMSG_SPACE(sizeof(int)); | ||||||
|  |     msg.msg_control = ctrl; | ||||||
|  |  | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |     cmsg->cmsg_level = SOL_SOCKET; | ||||||
|  |     cmsg->cmsg_type = SCM_RIGHTS; | ||||||
|  |     cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | ||||||
|  |  | ||||||
|  |     *((int *) CMSG_DATA(cmsg)) = fildes; | ||||||
|  |  | ||||||
|  |     sendmsg(sock, &msg, 0); | ||||||
|  |   }; | ||||||
|  | }; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| /*Construct from an MPI communicator*/ | /*Construct from an MPI communicator*/ | ||||||
| void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | ||||||
| { | { | ||||||
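The allocation loop later in this diff shows how the UnixSockets class above is driven; compressed into a sketch, the per-buffer protocol between the owning rank and its node peers looks like this (rank variables are placeholders; every rank is assumed to have already called UnixSockets::Open(myRank)):

```cpp
// Sketch of the SHM_SOCKETS descriptor-passing protocol (illustrative).
if (myRank == owningRank) {
  // Owner pushes the level-zero IPC file descriptor to every other rank...
  for (int rr = 0; rr < nodeSize; rr++)
    if (rr != myRank) UnixSockets::SendFileDescriptor(fd, rr);
} else {
  // ...each peer blocks in recvmsg() until its SCM_RIGHTS copy arrives.
  int myfd = UnixSockets::RecvFileDescriptor();
}
```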
| @@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M | |||||||
|   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); |   if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); | ||||||
|   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); |   else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); | ||||||
| } | } | ||||||
| static inline int divides(int a,int b) |  | ||||||
| { |  | ||||||
|   return ( b == ( (b/a)*a ) ); |  | ||||||
| } |  | ||||||
| void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) |  | ||||||
| { |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Allow user to configure through environment variable |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); |  | ||||||
|   if ( str ) { |  | ||||||
|     std::vector<int> IntShmDims; |  | ||||||
|     GridCmdOptionIntVector(std::string(str),IntShmDims); |  | ||||||
|     assert(IntShmDims.size() == WorldDims.size()); |  | ||||||
|     long ShmSize = 1; |  | ||||||
|     for (int dim=0;dim<WorldDims.size();dim++) { |  | ||||||
|       ShmSize *= (ShmDims[dim] = IntShmDims[dim]); |  | ||||||
|       assert(divides(ShmDims[dim],WorldDims[dim])); |  | ||||||
|     } |  | ||||||
|     assert(ShmSize == WorldShmSize); |  | ||||||
|     return; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Powers of 2,3,5 only in prime decomposition for now |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   int ndimension = WorldDims.size(); |  | ||||||
|   ShmDims=Coordinate(ndimension,1); |  | ||||||
|  |  | ||||||
|   std::vector<int> primes({2,3,5}); |  | ||||||
|  |  | ||||||
|   int dim = 0; |  | ||||||
|   int last_dim = ndimension - 1; |  | ||||||
|   int AutoShmSize = 1; |  | ||||||
|   while(AutoShmSize != WorldShmSize) { |  | ||||||
|     int p; |  | ||||||
|     for(p=0;p<primes.size();p++) { |  | ||||||
|       int prime=primes[p]; |  | ||||||
|       if ( divides(prime,WorldDims[dim]/ShmDims[dim]) |  | ||||||
|         && divides(prime,WorldShmSize/AutoShmSize)  ) { |  | ||||||
| 	AutoShmSize*=prime; |  | ||||||
| 	ShmDims[dim]*=prime; |  | ||||||
| 	last_dim = dim; |  | ||||||
| 	break; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if (p == primes.size() && last_dim == dim) { |  | ||||||
|       std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl; |  | ||||||
|       exit(EXIT_FAILURE); |  | ||||||
|     } |  | ||||||
|     dim=(dim+1) %ndimension; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) | void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) | ||||||
| { | { | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Loop over ranks/gpu's on our node |   // Loop over ranks/gpu's on our node | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |   UnixSockets::Open(WorldShmRank); | ||||||
|  | #endif | ||||||
|   for(int r=0;r<WorldShmSize;r++){ |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |  | ||||||
|  |     MPI_Barrier(WorldShmComm); | ||||||
|  |  | ||||||
| #ifndef GRID_MPI3_SHM_NONE | #ifndef GRID_MPI3_SHM_NONE | ||||||
|     ////////////////////////////////////////////////// |     ////////////////////////////////////////////////// | ||||||
|     // If it is me, pass around the IPC access key |     // If it is me, pass around the IPC access key | ||||||
| @@ -540,7 +602,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     void * thisBuf = ShmCommBuf; |     void * thisBuf = ShmCommBuf; | ||||||
|     if(!Stencil_force_mpi) { |     if(!Stencil_force_mpi) { | ||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|     typedef struct { int fd; pid_t pid ; } clone_mem_t; |     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; | ||||||
|  |  | ||||||
|     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device()); |     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device()); | ||||||
|     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context()); |     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context()); | ||||||
| @@ -551,13 +613,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     if ( r==WorldShmRank ) {  |     if ( r==WorldShmRank ) {  | ||||||
|       auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); |       auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); | ||||||
|       if ( err != ZE_RESULT_SUCCESS ) { |       if ( err != ZE_RESULT_SUCCESS ) { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||||
| 	exit(EXIT_FAILURE); | 	exit(EXIT_FAILURE); | ||||||
|       } else { |       } else { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; | ||||||
|       } |       } | ||||||
|       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); |       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); | ||||||
|       handle.pid = getpid(); |       handle.pid = getpid(); | ||||||
|  |       memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle)); | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |       for(int rr=0;rr<WorldShmSize;rr++){ | ||||||
|  | 	if(rr!=r){ | ||||||
|  | 	  UnixSockets::SendFileDescriptor(handle.fd,rr); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
| @@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     // Share this IPC handle across the Shm Comm |     // Share this IPC handle across the Shm Comm | ||||||
|     ////////////////////////////////////////////////// |     ////////////////////////////////////////////////// | ||||||
|     {  |     {  | ||||||
|  |       MPI_Barrier(WorldShmComm); | ||||||
|       int ierr=MPI_Bcast(&handle, |       int ierr=MPI_Bcast(&handle, | ||||||
| 			 sizeof(handle), | 			 sizeof(handle), | ||||||
| 			 MPI_BYTE, | 			 MPI_BYTE, | ||||||
| @@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|     if ( r!=WorldShmRank ) { |     if ( r!=WorldShmRank ) { | ||||||
|       thisBuf = nullptr; |       thisBuf = nullptr; | ||||||
|  |       int myfd; | ||||||
|  | #ifdef SHM_SOCKETS | ||||||
|  |       myfd=UnixSockets::RecvFileDescriptor(); | ||||||
|  | #else | ||||||
|       std::cout<<"mapping seeking remote pid/fd " |       std::cout<<"mapping seeking remote pid/fd " | ||||||
| 	       <<handle.pid<<"/" | 	       <<handle.pid<<"/" | ||||||
| 	       <<handle.fd<<std::endl; | 	       <<handle.fd<<std::endl; | ||||||
| @@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|       int pidfd = syscall(SYS_pidfd_open,handle.pid,0); |       int pidfd = syscall(SYS_pidfd_open,handle.pid,0); | ||||||
|       std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; |       std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; | ||||||
|       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); |       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); | ||||||
|       int myfd  = syscall(438,pidfd,handle.fd,0); |       myfd  = syscall(438,pidfd,handle.fd,0); | ||||||
|  |       int err_t = errno; | ||||||
|       std::cout<<"Using IpcHandle myfd "<<myfd<<"\n"; |       if (myfd < 0) { | ||||||
|        |         fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr); | ||||||
|  | 	perror("pidfd_getfd failed "); | ||||||
|  | 	assert(0); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; | ||||||
|  |       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); | ||||||
|       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); |       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); | ||||||
|  |  | ||||||
|       auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf); |       auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf); | ||||||
|       if ( err != ZE_RESULT_SUCCESS ) { |       if ( err != ZE_RESULT_SUCCESS ) { | ||||||
| 	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;  | ||||||
| 	exit(EXIT_FAILURE); | 	exit(EXIT_FAILURE); | ||||||
|       } else { |       } else { | ||||||
| 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl; | 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl; | ||||||
| @@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #else | #else | ||||||
|     WorldShmCommBufs[r] = ShmCommBuf; |     WorldShmCommBufs[r] = ShmCommBuf; | ||||||
| #endif | #endif | ||||||
|  |     MPI_Barrier(WorldShmComm); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   _ShmAllocBytes=bytes; |   _ShmAllocBytes=bytes; | ||||||
|   | |||||||
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   }
 }
 
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+    int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    if (idx >= e1*e2) return;
+
+    int n, b, o;
+
+    n = idx / e2;
+    b = idx % e2;
+    o = n*stride + b;
+
+    vector[2*idx + 0] = lo + o;
+    vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int ent=0;
 
   if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
+#endif
   } else { 
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int ent=0;
 
   if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
     for(int b=0;b<e2;b++){
       int o  =n*stride;
       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
     }}
+#endif
   } else {
     for(int n=0;n<e1;n++){
     for(int b=0;b<e2;b++){
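For context, the table-filling kernel above flattens the (n,b) loop nest to one thread per table entry and writes through &Cshift_table[0].first, relying on std::pair<int,int> laying out as two adjacent ints. A host-side reference for what one thread computes, as a sketch:

    #include <utility>

    // Reference semantics of populate_Cshift_table for flat idx in [0, e1*e2):
    std::pair<int,int> cshift_entry(int idx, int lo, int ro, int e2, int stride) {
      int n = idx / e2;          // which plane
      int b = idx % e2;          // offset within the plane
      int o = n*stride + b;      // strided site offset
      return {lo + o, ro + o};   // destination and source indices
    }

The launch shape is iDivUp(e1*e2, acceleratorThreads()) blocks; e.g. e1*e2 = 1000 at 128 threads per block gives 8 blocks, the last partially idle.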
@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 }
 
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
   Integer osites = arg.Grid()->oSites();
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-  typename vobj::scalar_object ssum;
   autoView( arg_v, arg, AcceleratorRead);
-  ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif  
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSum(arg);
   arg.Grid()->GlobalSum(ssum);
   return ssum;
 }
 
 template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu_large(&arg_v[0],osites);
+  return sum_gpu_large(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSumLarge(arg);
   arg.Grid()->GlobalSum(ssum);
   return ssum;
 }
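The refactor above splits the reduction: rankSum/rankSumLarge perform the node-local part (GPU or CPU path) and sum/sum_large compose it with the MPI collective. One use of exposing the rank-local piece is deferring the collective when many fields are reduced; a sketch of that pattern, with fields, N, and grid as illustrative names:

    // N local reductions first, communication afterwards.
    std::vector<typename vobj::scalar_object> partial(N);
    for(int i=0;i<N;i++) partial[i] = rankSum(fields[i]);  // no comms yet
    for(int i=0;i<N;i++) grid->GlobalSum(partial[i]);      // collectives grouped
                                                           // at the end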
@@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   assert(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
-  // UVM seems to be buggy under later CUDA drivers
-  // This fails on A100 and driver 5.30.02 / CUDA 12.1
-  // Fails with multiple NVCC versions back to 11.4,
-  // which worked with earlier drivers.
-  // Not sure which driver had first fail and this bears checking
-  // Is awkward as must install multiple driver versions
+  // Move out of UVM
+  // Turns out I had messed up the synchronise after move to compute stream
+  // as running this on the default stream fools the synchronise
 #undef UVM_BLOCK_BUFFER  
 #ifndef UVM_BLOCK_BUFFER  
   commVector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
   sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
   accelerator_barrier();
   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #else
   Vector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
   sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
   accelerator_barrier();
   result = *buffer_v;
 #endif
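The rewritten comment above records the actual failure mode: the reduction kernel used to launch on the legacy default stream while accelerator_barrier() synchronises Grid's computeStream, so the barrier could return before the kernel finished and the host read stale data. In bare CUDA terms, a sketch of the hazard and the fix:

    // Hazard (sketch): kernel on the default stream, sync on computeStream only.
    reduceKernel<<<blocks, threads, smem>>>(lat, buf, n);      // default stream
    cudaStreamSynchronize(computeStream);                      // does not wait
    // Fix applied in the hunk: launch on the stream the barrier synchronises.
    reduceKernel<<<blocks, threads, smem, computeStream>>>(lat, buf, n);
    cudaStreamSynchronize(computeStream);                      // orders host read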
@@ -440,17 +440,8 @@ public:
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-#if 1
-	assert(rank == _grid->ThisRank() );
-#else
-// 
-	if (rank != _grid->ThisRank() ){
-	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
-//	exit(-42);
-//	assert(0);
-	}
-#endif
 
+	assert(rank == _grid->ThisRank() );
 	
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
   }
 }
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                               const std::vector<Lattice<vobj>> &fineData,
+                               const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  assert(coarseData.size() == NBatch);
+
+  GridBase * fine  = fineData[0].Grid();
+  GridBase * coarse= coarseData[0].Grid();
+
+  Lattice<iScalar<CComplex>> ip(coarse);
+  std::vector<Lattice<vobj>> fineDataCopy = fineData;
+
+  autoView(ip_, ip, AcceleratorWrite);
+  for(int v=0;v<nbasis;v++) {
+    for (int k=0; k<NBatch; k++) {
+      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
+      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
+      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+        convertType(coarseData_[sc](v),ip_[sc]);
+      });
+
+      // improve numerical stability of projection
+      // |fine> = |fine> - <basis|fine> |basis>
+      ip=-ip;
+      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
+    }
+  }
+}
+
 template<class vobj,class vobj2,class CComplex>
   inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
 
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                               std::vector<Lattice<vobj>> &fineData,
+                               const VLattice &Basis)
+{
+  int NBatch = coarseData.size();
+  assert(fineData.size() == NBatch);
+
+  GridBase * fine   = fineData[0].Grid();
+  GridBase * coarse = coarseData[0].Grid();
+  for (int k=0; k<NBatch; k++)
+    fineData[k]=Zero();
+  for (int i=0;i<nbasis;i++) {
+    for (int k=0; k<NBatch; k++) {
+      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
+      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
+    }
+  }
+}
+
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
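A usage sketch for the two batched transfer routines added above; batchBlockProject and batchBlockPromote come from the hunks, while the grids, basis, and batch size are illustrative:

    const int NBatch = 4;
    std::vector<Lattice<iVector<CComplex,nbasis>>> coarse(NBatch, Lattice<iVector<CComplex,nbasis>>(CoarseGrid));
    std::vector<Lattice<vobj>>                     fine  (NBatch, Lattice<vobj>(FineGrid));
    // ... fill fine and an orthonormalised Basis ...
    batchBlockProject(coarse, fine, Basis);  // fine  -> coarse, whole batch at once
    batchBlockPromote(coarse, fine, Basis);  // coarse -> fine, overwrites fine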
@@ -507,6 +507,7 @@ public:
     }
     this->face_table_computed=1;
     assert(this->u_comm_offset==this->_unified_buffer_size);
+    accelerator_barrier();
   }
 
 };
@@ -196,7 +196,6 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   
   uint64_t Nsite = Umu.Grid()->oSites();
   Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
-
 };
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
@@ -247,10 +246,14 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 
     Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
 
+    std::cout << " InsertForce Btilde "<< norm2(Btilde)<<std::endl;
+
     ////////////////////////////
     // spin trace outer product
     ////////////////////////////
     Impl::InsertForce5D(mat, Btilde, Atilde, mu);
+
+    std::cout << " InsertForce "<< norm2(mat)<<std::endl;
   }
 }
 
@@ -332,8 +335,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   /////////////////////////////
   {
     GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor);
-    accelerator_barrier();
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
   }
   
   std::vector<std::vector<CommsRequest_t> > requests;
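The call-site change above pairs with the accelerator_barrier() added at the end of the gather routine (see the stencil hunk earlier): the gather kernels must have completed before the send buffers are handed to MPI, but the wait now lives inside the routine instead of at every caller. The required ordering for the overlapped-comms path, as a sketch:

    // 1. launch gather kernels into send buffers  (HaloExchangeOptGather)
    // 2. accelerator_barrier()                    -- now inside the routine
    // 3. post MPI sends/receives                  (CommunicateBegin)
    // 4. interior kernels overlap the transfers
    // 5. synchronise copies + StencilBarrier, then exterior kernels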
@@ -428,9 +428,10 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto ptr = &st.surface_list[0];					\
   accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
       int sF = ptr[ss];							\
-      int sU = ss/Ls;							\
+      int sU = sF/Ls;							\
       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-    });									
+    });									\
+  accelerator_barrier();
 
 #define ASM_CALL(A)							\
   thread_for( sss, Nsite, {						\
@@ -463,11 +464,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 
    if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifdef SYCL_HACK     
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; }
-#else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
-#endif     
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
@@ -478,8 +475,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     // dependent on result of merge
+     acceleratorFenceComputeStream();
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
@@ -502,21 +501,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
-     acceleratorFenceComputeStream();
   } else if( interior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
+     // Dependent on result of merge
     acceleratorFenceComputeStream();
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
-     acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
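Two details in the kernel hunk above are easy to miss. First, int sU = ss/Ls was a genuine indexing bug: ss counts surface-list entries, while the gauge site must come from the fermion site sF = ptr[ss] that the entry names, so sU = sF/Ls. Second, the exterior variants now fence the compute stream before launching because they consume halo data produced by the merge. The index relation with illustrative numbers:

    // Ls = 8, surface_list = { 5, 17, 130, ... } (sketch values).
    // Entry ss=2 names fermion site sF=130, whose 4d gauge site is
    //   sU = sF/Ls = 130/8 = 16;
    // the old sU = ss/Ls = 2/8 = 0 read the wrong gauge link.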
@@ -1 +0,0 @@
-../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master
@@ -1 +0,0 @@
-#define IMPLEMENTATION WilsonImplD2
@@ -1 +0,0 @@
-../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
-../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -1 +0,0 @@
-#define IMPLEMENTATION ZWilsonImplD2
@@ -119,13 +119,19 @@ public:
     //  X^dag Der_oe MeeInv Meo Y
     // Use Mooee as nontrivial but gauge field indept
     this->_Mat.MeooeDag   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
+    std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
     this->_Mat.MooeeInvDag(tmp1,tmp2);   // even->even 
+    std::cout << " tmp 1" << norm2(tmp2)<<std::endl;
     this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
+    std::cout << " ForceO " << norm2(ForceO)<<std::endl;
           
     //  Accumulate X^dag M_oe MeeInv Der_eo Y
     this->_Mat.Meooe   (U,tmp1);    // even->odd -- implicit -0.5 factor to be applied
+    std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
     this->_Mat.MooeeInv(tmp1,tmp2); // even->even 
+    std::cout << " tmp 2" << norm2(tmp2)<<std::endl;
     this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
+    std::cout << " ForceE " << norm2(ForceE)<<std::endl;
+
     assert(ForceE.Checkerboard()==Even);
     assert(ForceO.Checkerboard()==Odd);
@@ -38,91 +38,73 @@ NAMESPACE_BEGIN(Grid);
     // cf. GeneralEvenOddRational.h for details
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////
       
-    template<class ImplD, class ImplF, class ImplD2>
+    template<class ImplD, class ImplF>
     class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
     private:
-      typedef typename ImplD2::FermionField FermionFieldD2;
       typedef typename ImplD::FermionField FermionFieldD;
       typedef typename ImplF::FermionField FermionFieldF;
 
       FermionOperator<ImplD> & NumOpD;
       FermionOperator<ImplD> & DenOpD;
 
-      FermionOperator<ImplD2> & NumOpD2;
-      FermionOperator<ImplD2> & DenOpD2;
-      
      FermionOperator<ImplF> & NumOpF;
      FermionOperator<ImplF> & DenOpF;
 
      Integer ReliableUpdateFreq;
    protected:
 
+      //Action evaluation
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
-#if 0
+#if 1
 	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
 #else
-	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
-	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
-	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
+	FermionFieldD inD(NumOpD.FermionRedBlackGrid());
+	FermionFieldD outD(NumOpD.FermionRedBlackGrid());
 
 	// Action better with higher precision?
-	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
-	precisionChange(inD2,in);
-	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
-	msCG(schurOpD2, inD2, outD2);
-	precisionChange(out,outD2);
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out);
 #endif
      }
+      //Force evaluation
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
-	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
-	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF>  schurOpF(numerator ? NumOpF  : DenOpF);
 
-	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
-	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
-	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
-	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
-	precisionChange(inD2,in);
-	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
-	msCG(schurOpD2, inD2, out_elemsD2, outD2);
-	precisionChange(out,outD2);
-	for(int i=0;i<out_elems.size();i++){
-	  precisionChange(out_elems[i],out_elemsD2[i]);
-	}
+	FermionFieldD inD(NumOpD.FermionRedBlackGrid());
+	FermionFieldD outD(NumOpD.FermionRedBlackGrid());
+	std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
+	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
 
 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
-	typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
 	precisionChange(Uf, Ud);
-	precisionChange(Ud2, Ud);
 
-	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
+	std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
 	
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);
 
 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
 
-	NumOpD2.ImportGauge(Ud2);
-	DenOpD2.ImportGauge(Ud2);
      }
       
    public:
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
-							      FermionOperator<ImplD2>  &_NumOpD2, FermionOperator<ImplD2>  &_DenOpD2, 
 							      const RationalActionParams & p, Integer _ReliableUpdateFreq
 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
 								  ReliableUpdateFreq(_ReliableUpdateFreq),
 								  NumOpD(_NumOpD), DenOpD(_DenOpD),
-								  NumOpF(_NumOpF), DenOpF(_DenOpF),
-								  NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
+								  NumOpF(_NumOpF), DenOpF(_DenOpF)
      {}
       
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
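Net effect of the hunk above: the intermediate ImplD2 operator precision disappears from the template, the members, and the constructor; the mixed-precision multishift CG now acts directly on ImplD fields with ImplF driving the cheap inner solves, and the precisionChange round-trips are gone. How a call site changes, as a sketch with illustrative operator names:

    // Before (three precisions):
    //   GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<ImplD,ImplF,ImplD2>
    //     action(NumD, DenD, NumF, DenF, NumD2, DenD2, params, freq);
    // After (two):
    GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<ImplD,ImplF>
      action(NumD, DenD, NumF, DenF, params, freq);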
@@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid);
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
    };
 
-    template<class Impl,class ImplF,class ImplD2>
+    template<class Impl,class ImplF>
    class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
-      : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
+      : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
    public:
      typedef OneFlavourRationalParams Params;
    private:
@@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid);
 								 FermionOperator<Impl>  &_DenOp, 
 								 FermionOperator<ImplF>  &_NumOpF, 
 								 FermionOperator<ImplF>  &_DenOpF, 
-								 FermionOperator<ImplD2>  &_NumOpD2, 
-								 FermionOperator<ImplD2>  &_DenOpD2, 
 								 const Params & p, Integer ReliableUpdateFreq
 							) : 
-	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
+	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
 
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
    };
@@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
        // NumOp == V
        // DenOp == M
        //
-    AUDIT();
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
 
-    AUDIT();
        pickCheckerboard(Even,etaEven,eta);
-    AUDIT();
        pickCheckerboard(Odd,etaOdd,eta);
 
-    AUDIT();
        NumOp.ImportGauge(U);
-    AUDIT();
        DenOp.ImportGauge(U);
 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl;
-    AUDIT();
 
        SchurDifferentiableOperator<Impl> Mpc(DenOp);
-    AUDIT();
        SchurDifferentiableOperator<Impl> Vpc(NumOp);
-    AUDIT();
 
 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
-    AUDIT();
        // Odd det factors
        Mpc.MpcDag(etaOdd,PhiOdd);
-    AUDIT();
 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
        tmp=Zero();
-    AUDIT();
 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
-    AUDIT();
        HeatbathSolver(Vpc,PhiOdd,tmp);
-    AUDIT();
 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
        Vpc.Mpc(tmp,PhiOdd);            
 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
@@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
        //X = (Mdag M)^-1 V^dag phi
        //Y = (Mdag)^-1 V^dag  phi
        Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
        X=Zero();
        DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
+	std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
        Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
+	std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
 
        // phi^dag V (Mdag M)^-1 dV^dag  phi
        Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
  
        // phi^dag dV (Mdag M)^-1 V^dag  phi
        Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
 
        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
        Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
        Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force;
+	std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
 
        // FIXME No force contribution from EvenEven assumed here
        // Needs a fix for clover.
@@ -134,14 +134,12 @@ protected:
      double start_force = usecond();
 
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
-      AUDIT();
       
      as[level].actions.at(a)->deriv_timer_start();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();
 
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
-      AUDIT();
 
      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
@@ -382,12 +380,12 @@ public:
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 
 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
-	AUDIT();
 	as[level].actions.at(actionID)->refresh_timer_start();
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
-	AUDIT();
      }
 
      // Refresh the higher representation actions
@@ -424,7 +422,7 @@ public:
    // Actions
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
-	AUDIT();
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
@@ -434,7 +432,7 @@ public:
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
-	AUDIT();
      }
      as[level].apply(S_hireps, Representations, level, H);
    }
@@ -447,9 +445,9 @@ public:
    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
      
      for (int a = 0; a < repr_set.size(); ++a) {
-	AUDIT();
        RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
-	AUDIT();
        std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
        H += Hterm;
 
@@ -474,10 +472,10 @@ public:
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
-	AUDIT();
        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
   	        as[level].actions.at(actionID)->S_timer_stop();
-	AUDIT();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
@@ -490,7 +488,6 @@ public:
   
  void integrate(Field& U) 
  {
-    AUDIT();
    // reset the clocks
    t_U = 0;
    for (int level = 0; level < as.size(); ++level) {
@@ -508,10 +505,8 @@ public:
      assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same
      std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
    }
-    AUDIT();
 
    FieldImplementation::Project(U);
-    AUDIT();
 
    // and that we indeed got to the end of the trajectory
    assert(fabs(t_U - Params.trajL) < 1.0e-6);
@@ -320,7 +320,7 @@ struct Conj{
 
 struct TimesMinusI{
   //Complex single
-  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+  inline float32x4_t operator()(float32x4_t in){
    // ar ai br bi -> ai -ar ai -br
    float32x4_t r0, r1;
    r0 = vnegq_f32(in);        // -ar -ai -br -bi
@@ -328,7 +328,7 @@ struct TimesMinusI{
    return vtrn1q_f32(r1, r0); //  ar -ai  br -bi
  }
  //Complex double
-  inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+  inline float64x2_t operator()(float64x2_t in){
    // a ib -> b -ia
    float64x2_t tmp;
    tmp = vnegq_f64(in);
@@ -338,7 +338,7 @@ struct TimesMinusI{
 
 struct TimesI{
  //Complex single
-  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+  inline float32x4_t operator()(float32x4_t in){
    // ar ai br bi -> -ai ar -bi br
    float32x4_t r0, r1;
    r0 = vnegq_f32(in);        // -ar -ai -br -bi
@@ -346,7 +346,7 @@ struct TimesI{
    return vtrn1q_f32(r1, in); // -ai  ar -bi  br
  }
  //Complex double
-  inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+  inline float64x2_t operator()(float64x2_t in){
    // a ib -> -b ia
    float64x2_t tmp;
    tmp = vnegq_f64(in);
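The NEON hunks above only drop an unused second argument; the functors were already pure unary lane permutations. What the single-precision TimesI computes, following its own lane comments, as a sketch:

    #include <arm_neon.h>

    float32x4_t v  = {1.f, 2.f, 3.f, 4.f};  // (ar, ai, br, bi) = 1+2i, 3+4i
    float32x4_t iv = TimesI()(v);           // (-ai, ar, -bi, br) = -2+1i, -4+3i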
@@ -339,8 +339,8 @@ public:
  // Vectors that live on the symmetric heap in case of SHMEM
  // These are used; either SHM objects or refs to the above symmetric heap vectors
  // depending on comms target
-  Vector<cobj *> u_simd_send_buf;
-  Vector<cobj *> u_simd_recv_buf;
+  std::vector<cobj *> u_simd_send_buf;
+  std::vector<cobj *> u_simd_recv_buf;
 
  int u_comm_offset;
  int _unified_buffer_size;
@@ -348,7 +348,7 @@ public:
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
-#ifdef SHM_FAST_PATH
+#if 1
  inline int SameNode(int point) {
 
    int dimension    = this->_directions[point];
@@ -434,7 +434,6 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -443,7 +442,6 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
  }
 
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@@ -452,6 +450,9 @@ public:
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
+    acceleratorCopySynchronise();
+    // Everyone agrees we are all done
+    _grid->StencilBarrier(); 
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -529,7 +530,6 @@ public:
  {
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 
-    // conformable(source.Grid(),_grid);
    assert(source.Grid()==_grid);
 
    u_comm_offset=0;
@@ -655,8 +655,8 @@ public:
    CommsMerge(decompress,Mergers,Decompressions);
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
-    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    assert(MergersSHM.size()==0);
+    assert(DecompressionsSHM.size()==0);
  }
 
  template<class decompressor>
@@ -705,6 +705,7 @@ public:
 	}
      }
    }
+    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -1366,10 +1367,11 @@ public:
 	    int recv_from_rank;
 	    int xmit_to_rank;
 	    int shm_send=0;
-	    int shm_recv=0;
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
 #ifdef SHM_FAST_PATH
  #warning STENCIL SHM FAST PATH SELECTED
+  	  int shm_recv=0;
 	    // shm == receive pointer         if offnode
 	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
@@ -1402,7 +1404,6 @@ public:
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
 	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	      int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
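Summarising the stencil changes above: the device barrier and the intra-node shared-memory barrier leave CommunicateBegin (the gather now waits inside its own routine), and CommunicateComplete gains acceleratorCopySynchronise() plus a single StencilBarrier(), so every rank agrees the halos are in place exactly once per exchange. The per-exchange lifecycle after this patch, as a sketch:

    // CommunicateBegin:    post StencilSendToRecvFromBegin for every packet
    // ... interior compute overlaps the transfers ...
    // CommunicateComplete: complete requests, log Dirichlet/full traffic,
    //                      acceleratorCopySynchronise();  // device copies done
    //                      _grid->StencilBarrier();       // node-wide agreement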
| @@ -133,7 +133,6 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec | |||||||
|   typedef scalar_type * pointer; |   typedef scalar_type * pointer; | ||||||
|  |  | ||||||
|   constexpr int words=sizeof(vobj)/sizeof(vector_type); |   constexpr int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|   constexpr int Nsimd=vector_type::Nsimd(); |  | ||||||
|  |  | ||||||
|   scalar_object extracted; |   scalar_object extracted; | ||||||
|   pointer __restrict__  sp = (pointer)&extracted; // Type pun |   pointer __restrict__  sp = (pointer)&extracted; // Type pun | ||||||
| @@ -153,7 +152,6 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob | |||||||
|   typedef scalar_type * pointer; |   typedef scalar_type * pointer; | ||||||
|  |  | ||||||
|   constexpr int words=sizeof(vobj)/sizeof(vector_type); |   constexpr int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|   constexpr int Nsimd=vector_type::Nsimd(); |  | ||||||
|  |  | ||||||
|   pointer __restrict__ sp = (pointer)&extracted; |   pointer __restrict__ sp = (pointer)&extracted; | ||||||
|   vector_type *vp = (vector_type *)&vec; |   vector_type *vp = (vector_type *)&vec; | ||||||
| @@ -178,8 +176,6 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off | |||||||
|   const int s = Nsimd/Nextr; |   const int s = Nsimd/Nextr; | ||||||
|  |  | ||||||
|   vector_type * vp = (vector_type *)&vec; |   vector_type * vp = (vector_type *)&vec; | ||||||
|   scalar_type      vtmp; |  | ||||||
|   sobj_scalar_type stmp; |  | ||||||
|   for(int w=0;w<words;w++){ |   for(int w=0;w<words;w++){ | ||||||
|     for(int i=0;i<Nextr;i++){ |     for(int i=0;i<Nextr;i++){ | ||||||
|       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset]; |       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset]; | ||||||
| @@ -205,7 +201,6 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset) | |||||||
|  |  | ||||||
|   vector_type * vp = (vector_type *)&vec; |   vector_type * vp = (vector_type *)&vec; | ||||||
|   scalar_type      vtmp; |   scalar_type      vtmp; | ||||||
|   sobj_scalar_type stmp; |  | ||||||
|   for(int w=0;w<words;w++){ |   for(int w=0;w<words;w++){ | ||||||
|     for(int i=0;i<Nextr;i++){ |     for(int i=0;i<Nextr;i++){ | ||||||
|       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset]; |       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset]; | ||||||
| @@ -242,9 +237,6 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest | |||||||
|   typedef oextract_type * opointer; |   typedef oextract_type * opointer; | ||||||
|   typedef iextract_type * ipointer; |   typedef iextract_type * ipointer; | ||||||
|  |  | ||||||
|   constexpr int oNsimd=ovector_type::Nsimd(); |  | ||||||
|   constexpr int iNsimd=ivector_type::Nsimd(); |  | ||||||
|  |  | ||||||
|   iscalar_type itmp; |   iscalar_type itmp; | ||||||
|   oscalar_type otmp; |   oscalar_type otmp; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -458,6 +458,7 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); | |||||||
| // Common on all GPU targets | // Common on all GPU targets | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) | #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|  | // FIXME -- the non-blocking nature got broken March 30 2023 by PAB | ||||||
| #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );   | #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );   | ||||||
|  |  | ||||||
| #define accelerator_for( iter, num, nsimd, ... )		\ | #define accelerator_for( iter, num, nsimd, ... )		\ | ||||||
| @@ -525,7 +526,7 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);}; | |||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
| inline void acceleratorFenceComputeStream(void){ accelerator_barrier();}; | inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); }; | ||||||
| #else | #else | ||||||
| // Ordering within a stream guaranteed on Nvidia & AMD | // Ordering within a stream guaranteed on Nvidia & AMD | ||||||
| inline void acceleratorFenceComputeStream(void){ }; | inline void acceleratorFenceComputeStream(void){ }; | ||||||
|   | |||||||
| @@ -227,7 +227,7 @@ int main(int argc, char **argv) { | |||||||
|   //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated |   //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated | ||||||
|   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); |   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); | ||||||
|  |  | ||||||
|   int SP_iters=10000; |   int SP_iters=9000; | ||||||
|    |    | ||||||
|   RationalActionParams OFRp; // Up/down |   RationalActionParams OFRp; // Up/down | ||||||
|   OFRp.lo       = 6.0e-5; |   OFRp.lo       = 6.0e-5; | ||||||
| @@ -362,12 +362,12 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   // Probably dominates the force - back to EOFA. |   // Probably dominates the force - back to EOFA. | ||||||
|   OneFlavourRationalParams SFRp; |   OneFlavourRationalParams SFRp; | ||||||
|   SFRp.lo       = 0.25; |   SFRp.lo       = 0.1; | ||||||
|   SFRp.hi       = 25.0; |   SFRp.hi       = 25.0; | ||||||
|   SFRp.MaxIter  = 10000; |   SFRp.MaxIter  = 10000; | ||||||
|   SFRp.tolerance= 1.0e-5; |   SFRp.tolerance= 1.0e-8; | ||||||
|   SFRp.mdtolerance= 2.0e-4; |   SFRp.mdtolerance= 2.0e-4; | ||||||
|   SFRp.degree   = 8; |   SFRp.degree   = 12; | ||||||
|   SFRp.precision= 50; |   SFRp.precision= 50; | ||||||
|    |    | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); | ||||||
| @@ -451,7 +451,7 @@ int main(int argc, char **argv) { | |||||||
|    |    | ||||||
| #define MIXED_PRECISION | #define MIXED_PRECISION | ||||||
| #ifdef MIXED_PRECISION | #ifdef MIXED_PRECISION | ||||||
|   std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys; |   std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys; | ||||||
| #else | #else | ||||||
|   std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; |   std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; | ||||||
| #endif | #endif | ||||||
| @@ -526,15 +526,13 @@ int main(int argc, char **argv) { | |||||||
|       Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); |       Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); | ||||||
|     } else { |     } else { | ||||||
| #ifdef MIXED_PRECISION | #ifdef MIXED_PRECISION | ||||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>( |       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>( | ||||||
| 			   *Numerators[h],*Denominators[h], | 			   *Numerators[h],*Denominators[h], | ||||||
| 			   *NumeratorsF[h],*DenominatorsF[h], | 			   *NumeratorsF[h],*DenominatorsF[h], | ||||||
| 			   *Numerators[h],*Denominators[h], |  | ||||||
| 			   OFRp, SP_iters) ); | 			   OFRp, SP_iters) ); | ||||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>( |       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>( | ||||||
| 			   *Numerators[h],*Denominators[h], | 			   *Numerators[h],*Denominators[h], | ||||||
| 			   *NumeratorsF[h],*DenominatorsF[h], | 			   *NumeratorsF[h],*DenominatorsF[h], | ||||||
| 			   *Numerators[h],*Denominators[h], |  | ||||||
| 			   OFRp, SP_iters) ); | 			   OFRp, SP_iters) ); | ||||||
| #else | #else | ||||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); |       Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); | ||||||
|   | |||||||
| @@ -329,7 +329,6 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|      |      | ||||||
|     auto grid4= GridPtr; |     auto grid4= GridPtr; | ||||||
|     auto rbgrid4= GridRBPtr; |  | ||||||
|     auto rbgrid = StrangeOp.FermionRedBlackGrid(); |     auto rbgrid = StrangeOp.FermionRedBlackGrid(); | ||||||
|     auto grid = StrangeOp.FermionGrid(); |     auto grid = StrangeOp.FermionGrid(); | ||||||
|     if(1){ |     if(1){ | ||||||
|   | |||||||
| @@ -164,11 +164,6 @@ int main(int argc, char **argv) { | |||||||
|   typedef MobiusEOFAFermionF FermionEOFAActionF; |   typedef MobiusEOFAFermionF FermionEOFAActionF; | ||||||
|   typedef typename FermionActionF::FermionField FermionFieldF; |   typedef typename FermionActionF::FermionField FermionFieldF; | ||||||
|  |  | ||||||
|   typedef WilsonImplD2 FermionImplPolicyD2; |  | ||||||
|   typedef MobiusFermionD2 FermionActionD2; |  | ||||||
|   typedef MobiusEOFAFermionD2 FermionEOFAActionD2; |  | ||||||
|   typedef typename FermionActionD2::FermionField FermionFieldD2; |  | ||||||
|  |  | ||||||
|   typedef Grid::XmlReader       Serialiser; |   typedef Grid::XmlReader       Serialiser; | ||||||
|  |  | ||||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: |   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: | ||||||
| @@ -250,11 +245,6 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   GlobalSharedMemory::GetShmDims(mpi,shm); |   GlobalSharedMemory::GetShmDims(mpi,shm); | ||||||
|  |  | ||||||
|   Coordinate CommDim(Nd); |  | ||||||
|   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; |  | ||||||
|  |  | ||||||
|   Coordinate NonDirichlet(Nd+1,0); |  | ||||||
|  |  | ||||||
|   ////////////////////////// |   ////////////////////////// | ||||||
|   // Fermion Grids |   // Fermion Grids | ||||||
|   ////////////////////////// |   ////////////////////////// | ||||||
| @@ -272,7 +262,6 @@ int main(int argc, char **argv) { | |||||||
|   // temporarily need a gauge field |   // temporarily need a gauge field | ||||||
|   LatticeGaugeFieldD  U(GridPtr); U=Zero(); |   LatticeGaugeFieldD  U(GridPtr); U=Zero(); | ||||||
|   LatticeGaugeFieldF  UF(GridPtrF); UF=Zero(); |   LatticeGaugeFieldF  UF(GridPtrF); UF=Zero(); | ||||||
|   LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero(); |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file | ||||||
| @@ -283,8 +272,6 @@ int main(int argc, char **argv) { | |||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |   std::vector<Complex> boundary = {1,1,1,-1}; | ||||||
|   FermionAction::ImplParams Params(boundary); |   FermionAction::ImplParams Params(boundary); | ||||||
|   FermionActionF::ImplParams ParamsF(boundary); |   FermionActionF::ImplParams ParamsF(boundary); | ||||||
|   Params.dirichlet=NonDirichlet; |  | ||||||
|   ParamsF.dirichlet=NonDirichlet; |  | ||||||
|  |  | ||||||
|   //  double StoppingCondition = 1e-14; |   //  double StoppingCondition = 1e-14; | ||||||
|   //  double MDStoppingCondition = 1e-9; |   //  double MDStoppingCondition = 1e-9; | ||||||
| @@ -311,12 +298,12 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   // Probably dominates the force - back to EOFA. |   // Probably dominates the force - back to EOFA. | ||||||
|   OneFlavourRationalParams SFRp; |   OneFlavourRationalParams SFRp; | ||||||
|   SFRp.lo       = 0.25; |   SFRp.lo       = 0.1; | ||||||
|   SFRp.hi       = 25.0; |   SFRp.hi       = 30.0; | ||||||
|   SFRp.MaxIter  = 10000; |   SFRp.MaxIter  = 10000; | ||||||
|   SFRp.tolerance= 1.0e-5; |   SFRp.tolerance= 1.0e-8; | ||||||
|   SFRp.mdtolerance= 2.0e-4; |   SFRp.mdtolerance= 2.0e-6; | ||||||
|   SFRp.degree   = 8; |   SFRp.degree   = 10; | ||||||
|   SFRp.precision= 50; |   SFRp.precision= 50; | ||||||
|    |    | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); | ||||||
| @@ -376,33 +363,29 @@ int main(int argc, char **argv) { | |||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   std::vector<Real> light_den; |   std::vector<Real> light_den; | ||||||
|   std::vector<Real> light_num; |   std::vector<Real> light_num; | ||||||
|   std::vector<int> dirichlet_den; |  | ||||||
|   std::vector<int> dirichlet_num; |  | ||||||
|  |  | ||||||
|   int n_hasenbusch = hasenbusch.size(); |   int n_hasenbusch = hasenbusch.size(); | ||||||
|   light_den.push_back(light_mass);  dirichlet_den.push_back(0); |   light_den.push_back(light_mass);  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |   for(int h=0;h<n_hasenbusch;h++){ | ||||||
|     light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0); |     light_den.push_back(hasenbusch[h]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |   for(int h=0;h<n_hasenbusch;h++){ | ||||||
|     light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0); |     light_num.push_back(hasenbusch[h]); | ||||||
|   } |   } | ||||||
|   light_num.push_back(pv_mass);  dirichlet_num.push_back(0); |   light_num.push_back(pv_mass); | ||||||
|  |  | ||||||
|   std::vector<FermionAction *> Numerators; |   std::vector<FermionAction *> Numerators; | ||||||
|   std::vector<FermionAction *> Denominators; |   std::vector<FermionAction *> Denominators; | ||||||
|   std::vector<FermionActionF *> NumeratorsF; |   std::vector<FermionActionF *> NumeratorsF; | ||||||
|   std::vector<FermionActionF *> DenominatorsF; |   std::vector<FermionActionF *> DenominatorsF; | ||||||
|   std::vector<FermionActionD2 *> NumeratorsD2; |  | ||||||
|   std::vector<FermionActionD2 *> DenominatorsD2; |  | ||||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; |   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; | ||||||
|   std::vector<MxPCG *> ActionMPCG; |   std::vector<MxPCG *> ActionMPCG; | ||||||
|   std::vector<MxPCG *> MPCG; |   std::vector<MxPCG *> MPCG; | ||||||
|    |    | ||||||
| #define MIXED_PRECISION | #define MIXED_PRECISION | ||||||
| #ifdef MIXED_PRECISION | #ifdef MIXED_PRECISION | ||||||
|   std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys; |   std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys; | ||||||
| #else | #else | ||||||
|   std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; |   std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; | ||||||
| #endif | #endif | ||||||
| @@ -416,9 +399,7 @@ int main(int argc, char **argv) { | |||||||
|     std::cout << GridLogMessage |     std::cout << GridLogMessage | ||||||
| 	      << " 2f quotient Action "; | 	      << " 2f quotient Action "; | ||||||
|     std::cout << "det D("<<light_den[h]<<")"; |     std::cout << "det D("<<light_den[h]<<")"; | ||||||
|     if ( dirichlet_den[h] ) std::cout << "^dirichlet    "; |  | ||||||
|     std::cout << "/ det D("<<light_num[h]<<")"; |     std::cout << "/ det D("<<light_num[h]<<")"; | ||||||
|     if ( dirichlet_num[h] ) std::cout << "^dirichlet    "; |  | ||||||
|     std::cout << std::endl; |     std::cout << std::endl; | ||||||
|  |  | ||||||
|     FermionAction::ImplParams ParamsNum(boundary); |     FermionAction::ImplParams ParamsNum(boundary); | ||||||
| @@ -426,21 +407,11 @@ int main(int argc, char **argv) { | |||||||
|     FermionActionF::ImplParams ParamsDenF(boundary); |     FermionActionF::ImplParams ParamsDenF(boundary); | ||||||
|     FermionActionF::ImplParams ParamsNumF(boundary); |     FermionActionF::ImplParams ParamsNumF(boundary); | ||||||
|      |      | ||||||
|     ParamsNum.dirichlet = NonDirichlet; |  | ||||||
|     ParamsDen.dirichlet = NonDirichlet; |  | ||||||
|  |  | ||||||
|     ParamsNum.partialDirichlet = 0; |  | ||||||
|     ParamsDen.partialDirichlet = 0; |  | ||||||
|      |  | ||||||
|     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); |     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); | ||||||
|     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); |     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); | ||||||
|  |  | ||||||
|     ParamsDenF.dirichlet = ParamsDen.dirichlet; |  | ||||||
|     ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; |  | ||||||
|     DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); |     DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); | ||||||
|  |  | ||||||
|     ParamsNumF.dirichlet = ParamsNum.dirichlet; |  | ||||||
|     ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; |  | ||||||
|     NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); |     NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); | ||||||
|  |  | ||||||
|     LinOpD.push_back(new LinearOperatorD(*Denominators[h])); |     LinOpD.push_back(new LinearOperatorD(*Denominators[h])); | ||||||
| @@ -477,7 +448,6 @@ int main(int argc, char **argv) { | |||||||
|   // Gauge action |   // Gauge action | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   Level3.push_back(&GaugeAction); |   Level3.push_back(&GaugeAction); | ||||||
|   //  TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |   TheHMC.TheAction.push_back(Level2); | ||||||
|   TheHMC.TheAction.push_back(Level3); |   TheHMC.TheAction.push_back(Level3); | ||||||
|   std::cout << GridLogMessage << " Action complete "<< std::endl; |   std::cout << GridLogMessage << " Action complete "<< std::endl; | ||||||
|   | |||||||
| @@ -1,7 +1,8 @@ | |||||||
| # Grid [CI status](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview)  | # Grid  | ||||||
|  |  | ||||||
| **Data parallel C++ mathematical object library.** | **Data parallel C++ mathematical object library.** | ||||||
|  |  | ||||||
|  | [CI status](https://ci.dev.dirac.ed.ac.uk/project/GridBasedSoftware_Grid?mode=builds)  | ||||||
|  |  | ||||||
| License: GPL v2. | License: GPL v2. | ||||||
|  |  | ||||||
| Last update June 2017. | Last update June 2017. | ||||||
|   | |||||||
| @@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) | |||||||
|  |  | ||||||
|   err = r_eo-result; |   err = r_eo-result; | ||||||
|   n2e= norm2(err); |   n2e= norm2(err); | ||||||
|   std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl; |   std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl; | ||||||
|   assert(n2e<1.0e-4); |   assert(n2e<1.0e-4); | ||||||
|  |  | ||||||
|   pickCheckerboard(Even,src_e,err); |   pickCheckerboard(Even,src_e,err); | ||||||
|   | |||||||
benchmarks/Benchmark_dwf_fp32_paranoid.cc (new file, 387 lines)
| @@ -0,0 +1,387 @@ | |||||||
|  |  /************************************************************************************* | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |     Source file: ./benchmarks/Benchmark_dwf.cc | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  |     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #ifdef GRID_CUDA | ||||||
|  | #define CUDA_PROFILE | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef CUDA_PROFILE | ||||||
|  | #include <cuda_profiler_api.h> | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | template<class d> | ||||||
|  | struct scal { | ||||||
|  |   d internal; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |   Gamma::Algebra Gmu [] = { | ||||||
|  |     Gamma::Algebra::GammaX, | ||||||
|  |     Gamma::Algebra::GammaY, | ||||||
|  |     Gamma::Algebra::GammaZ, | ||||||
|  |     Gamma::Algebra::GammaT | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   int threads = GridThread::GetThreads(); | ||||||
|  |  | ||||||
|  |   Coordinate latt4 = GridDefaultLatt(); | ||||||
|  |   int Ls=16; | ||||||
|  |   for(int i=0;i<argc;i++) | ||||||
|  |     if(std::string(argv[i]) == "-Ls"){ | ||||||
|  |       std::stringstream ss(argv[i+1]); ss >> Ls; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   GridLogLayout(); | ||||||
|  |  | ||||||
|  |   long unsigned int single_site_flops = 8*Nc*(7+16*Nc); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "Making s innermost grids"<<std::endl; | ||||||
|  |   GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||||
|  |   GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; | ||||||
|  |   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG")); | ||||||
|  |   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; | ||||||
|  |   GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG")); | ||||||
|  |   std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  |   LatticeFermionF src   (FGrid); random(RNG5,src); | ||||||
|  |   LatticeFermionF src1   (FGrid); random(RNG5,src1); | ||||||
|  | #if 0 | ||||||
|  |   src = Zero(); | ||||||
|  |   { | ||||||
|  |     Coordinate origin({0,0,0,latt4[2]-1,0}); | ||||||
|  |     SpinColourVectorF tmp; | ||||||
|  |     tmp=Zero(); | ||||||
|  |     tmp()(0)(0)=Complex(-2.0,0.0); | ||||||
|  |     std::cout << " source site 0 " << tmp<<std::endl; | ||||||
|  |     pokeSite(tmp,src,origin); | ||||||
|  |   } | ||||||
|  | #else | ||||||
|  |   RealD N2 = 1.0/::sqrt(norm2(src)); | ||||||
|  |   src = src*N2; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   LatticeFermionF result(FGrid); result=Zero(); | ||||||
|  |   LatticeFermionF    ref(FGrid);    ref=Zero(); | ||||||
|  |   LatticeFermionF    tmp(FGrid); | ||||||
|  |   LatticeFermionF    err(FGrid); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||||
|  |   LatticeGaugeFieldF Umu(UGrid); | ||||||
|  |   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||||
|  |   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||||
|  | #if 0 | ||||||
|  |   Umu=1.0; | ||||||
|  |   for(int mu=0;mu<Nd;mu++){ | ||||||
|  |     LatticeColourMatrixF ttmp(UGrid); | ||||||
|  |     ttmp = PeekIndex<LorentzIndex>(Umu,mu); | ||||||
|  |     //    if (mu !=2 ) ttmp = 0; | ||||||
|  |     //    ttmp = ttmp* pow(10.0,mu); | ||||||
|  |     PokeIndex<LorentzIndex>(Umu,ttmp,mu); | ||||||
|  |   } | ||||||
|  |   std::cout << GridLogMessage << "Forced to diagonal " << std::endl; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // Naive wilson implementation | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // replicate across fifth dimension | ||||||
|  |   //  LatticeGaugeFieldF Umu5d(FGrid); | ||||||
|  |   std::vector<LatticeColourMatrixF> U(4,UGrid); | ||||||
|  |   for(int mu=0;mu<Nd;mu++){ | ||||||
|  |     U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||||
|  |   } | ||||||
|  |   std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; | ||||||
|  |  | ||||||
|  |   if (1) | ||||||
|  |   { | ||||||
|  |     ref = Zero(); | ||||||
|  |     for(int mu=0;mu<Nd;mu++){ | ||||||
|  |  | ||||||
|  |       tmp = Cshift(src,mu+1,1); | ||||||
|  |       { | ||||||
|  | 	autoView( tmp_v  , tmp  , CpuWrite); | ||||||
|  | 	autoView( U_v  , U[mu]  , CpuRead); | ||||||
|  | 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||||
|  | 	  for(int s=0;s<Ls;s++){ | ||||||
|  | 	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       ref=ref + tmp - Gamma(Gmu[mu])*tmp; | ||||||
|  |  | ||||||
|  |       { | ||||||
|  | 	autoView( tmp_v  , tmp  , CpuWrite); | ||||||
|  | 	autoView( U_v  , U[mu]  , CpuRead); | ||||||
|  | 	autoView( src_v, src    , CpuRead); | ||||||
|  | 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||||
|  | 	  for(int s=0;s<Ls;s++){ | ||||||
|  | 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       tmp =Cshift(tmp,mu+1,-1); | ||||||
|  |       ref=ref + tmp + Gamma(Gmu[mu])*tmp; | ||||||
|  |     } | ||||||
|  |     ref = -0.5*ref; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   RealD mass=0.1; | ||||||
|  |   RealD M5  =1.8; | ||||||
|  |  | ||||||
|  |   RealD NP = UGrid->_Nprocessors; | ||||||
|  |   RealD NN = UGrid->NodeCount(); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl; | ||||||
|  |   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  |   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||||
|  | #endif | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |  | ||||||
|  |   DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||||
|  |   int ncall =100; | ||||||
|  |  | ||||||
|  |   if (1) { | ||||||
|  |     FGrid->Barrier(); | ||||||
|  |     Dw.Dhop(src,result,0); | ||||||
|  |     std::cout<<GridLogMessage<<"Called warmup"<<std::endl; | ||||||
|  |     double t0=usecond(); | ||||||
|  |     for(int i=0;i<ncall;i++){ | ||||||
|  |       Dw.Dhop(src1,result,0); | ||||||
|  |       Dw.Dhop(src,result,0); | ||||||
|  |       err = ref-result; | ||||||
|  |       std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||||
|  |       assert (norm2(err)< 1.0e-4 ); | ||||||
|  |     } | ||||||
|  |     double t1=usecond(); | ||||||
|  |     FGrid->Barrier(); | ||||||
|  |  | ||||||
|  |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  |     double flops=single_site_flops*volume*ncall; | ||||||
|  |  | ||||||
|  |     auto nsimd = vComplex::Nsimd(); | ||||||
|  |     auto simdwidth = sizeof(vComplex); | ||||||
|  |  | ||||||
|  |     // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors | ||||||
|  |     double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||||
|  |  | ||||||
|  |     // mem: Nd Wilson * Ls, Nd gauge, Nc colors | ||||||
|  |     double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|  |     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||||
|  |     //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl; | ||||||
|  |     err = ref-result; | ||||||
|  |     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||||
|  |     //exit(0); | ||||||
|  |  | ||||||
|  |     if(( norm2(err)>1.0e-4) ) { | ||||||
|  |  | ||||||
|  |       /* | ||||||
|  |       std::cout << "RESULT\n " << result<<std::endl; | ||||||
|  |       std::cout << "REF   \n " << ref   <<std::endl; | ||||||
|  |       std::cout << "ERR   \n " << err   <<std::endl; | ||||||
|  |       */ | ||||||
|  |       std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; | ||||||
|  |       FGrid->Barrier(); | ||||||
|  |       exit(-1); | ||||||
|  |     } | ||||||
|  |     assert (norm2(err)< 1.0e-4 ); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if (1) | ||||||
|  |   { // Naive wilson dag implementation | ||||||
|  |     ref = Zero(); | ||||||
|  |     for(int mu=0;mu<Nd;mu++){ | ||||||
|  |  | ||||||
|  |       //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x | ||||||
|  |       tmp = Cshift(src,mu+1,1); | ||||||
|  |       { | ||||||
|  | 	autoView( ref_v, ref, CpuWrite); | ||||||
|  | 	autoView( tmp_v, tmp, CpuRead); | ||||||
|  | 	autoView( U_v  , U[mu]  , CpuRead); | ||||||
|  | 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||||
|  | 	  for(int s=0;s<Ls;s++){ | ||||||
|  | 	    int i=s+Ls*ss; | ||||||
|  | 	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |        | ||||||
|  |       { | ||||||
|  | 	autoView( tmp_v  , tmp  , CpuWrite); | ||||||
|  | 	autoView( U_v  , U[mu]  , CpuRead); | ||||||
|  | 	autoView( src_v, src    , CpuRead); | ||||||
|  | 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||||
|  | 	  for(int s=0;s<Ls;s++){ | ||||||
|  | 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       //      tmp =adj(U[mu])*src; | ||||||
|  |       tmp =Cshift(tmp,mu+1,-1); | ||||||
|  |       { | ||||||
|  | 	autoView( ref_v, ref, CpuWrite); | ||||||
|  | 	autoView( tmp_v, tmp, CpuRead); | ||||||
|  | 	for(int i=0;i<ref_v.size();i++){ | ||||||
|  | 	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     ref = -0.5*ref; | ||||||
|  |   } | ||||||
|  |   //  dump=1; | ||||||
|  |   Dw.Dhop(src,result,1); | ||||||
|  |   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "Called DwDag"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl; | ||||||
|  |   err = ref-result; | ||||||
|  |   std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl; | ||||||
|  |   if((norm2(err)>1.0e-4)){ | ||||||
|  | /* | ||||||
|  | 	std::cout<< "DAG RESULT\n "  <<ref     << std::endl; | ||||||
|  | 	std::cout<< "DAG sRESULT\n " <<result  << std::endl; | ||||||
|  | 	std::cout<< "DAG ERR   \n "  << err    <<std::endl; | ||||||
|  | */ | ||||||
|  |   } | ||||||
|  |   LatticeFermionF src_e (FrbGrid); | ||||||
|  |   LatticeFermionF src_o (FrbGrid); | ||||||
|  |   LatticeFermionF r_e   (FrbGrid); | ||||||
|  |   LatticeFermionF r_o   (FrbGrid); | ||||||
|  |   LatticeFermionF r_eo  (FGrid); | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; | ||||||
|  |   pickCheckerboard(Even,src_e,src); | ||||||
|  |   pickCheckerboard(Odd,src_o,src); | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   // S-direction is INNERMOST and takes no part in the parity. | ||||||
|  |   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||||
|  |   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  |   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||||
|  | #endif | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|  |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||||
|  |   { | ||||||
|  |     FGrid->Barrier(); | ||||||
|  |     Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  |     double t0=usecond(); | ||||||
|  |     for(int i=0;i<ncall;i++){ | ||||||
|  | #ifdef CUDA_PROFILE | ||||||
|  |       if(i==10) cudaProfilerStart(); | ||||||
|  | #endif | ||||||
|  |       Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | #ifdef CUDA_PROFILE | ||||||
|  |       if(i==20) cudaProfilerStop(); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |     double t1=usecond(); | ||||||
|  |     FGrid->Barrier(); | ||||||
|  |  | ||||||
|  |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  |     double flops=(single_site_flops*volume*ncall)/2.0; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl; | ||||||
|  |   } | ||||||
|  |   Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  |   Dw.DhopOE(src_e,r_o,DaggerNo); | ||||||
|  |   Dw.Dhop  (src  ,result,DaggerNo); | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl; | ||||||
|  |  | ||||||
|  |   setCheckerboard(r_eo,r_o); | ||||||
|  |   setCheckerboard(r_eo,r_e); | ||||||
|  |  | ||||||
|  |   err = r_eo-result; | ||||||
|  |   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||||
|  |   if((norm2(err)>1.0e-4)){ | ||||||
|  |     /* | ||||||
|  | 	std::cout<< "Deo RESULT\n " <<r_eo << std::endl; | ||||||
|  | 	std::cout<< "Deo REF\n " <<result  << std::endl; | ||||||
|  | 	std::cout<< "Deo ERR   \n " << err <<std::endl; | ||||||
|  |     */ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   pickCheckerboard(Even,src_e,err); | ||||||
|  |   pickCheckerboard(Odd,src_o,err); | ||||||
|  |   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||||
|  |  | ||||||
|  |   assert(norm2(src_e)<1.0e-4); | ||||||
|  |   assert(norm2(src_o)<1.0e-4); | ||||||
|  |   Grid_finalize(); | ||||||
|  |   exit(0); | ||||||
|  | } | ||||||
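The `single_site_flops` constant at the top of this new benchmark is the standard Wilson dslash operation count; the arithmetic is:

```latex
N_{\mathrm{flop}} \;=\; 8\,N_c\,(7+16\,N_c)
\;\stackrel{N_c=3}{=}\; 24\times 55 \;=\; 1320 \ \text{flops per site}
```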
examples/socket_grid.cc (new file, 133 lines)
| @@ -0,0 +1,133 @@ | |||||||
|  | #include <sys/socket.h> | ||||||
|  | #include <sys/un.h> | ||||||
|  | #include <unistd.h> | ||||||
|  | #include <stdio.h> | ||||||
|  | #include <err.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  | #include <assert.h> | ||||||
|  | #include <string.h> | ||||||
|  | #include <stdlib.h> | ||||||
|  |  | ||||||
|  | static int sock; | ||||||
|  | static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; | ||||||
|  | static char sock_path[256]; | ||||||
|  |  | ||||||
|  | class UnixSockets { | ||||||
|  | public: | ||||||
|  |   static void Open(int rank) | ||||||
|  |   { | ||||||
|  |     int errnum; | ||||||
|  |  | ||||||
|  |     sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0); | ||||||
|  |     printf("allocated socket %d\n",sock); | ||||||
|  |  | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); | ||||||
|  |     unlink(sa_un.sun_path); | ||||||
|  |     if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { | ||||||
|  |       perror("bind failure"); | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |     printf("bound socket %d to %s\n",sock,sa_un.sun_path); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static int RecvFileDescriptor(void) | ||||||
|  |   { | ||||||
|  |     int n; | ||||||
|  |     int fd; | ||||||
|  |     char buf[1]; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct cmsghdr *cmsg; | ||||||
|  |     char cms[CMSG_SPACE(sizeof(int))]; | ||||||
|  |  | ||||||
|  |     iov.iov_base = buf; | ||||||
|  |     iov.iov_len = 1; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof msg); | ||||||
|  |     msg.msg_name = 0; | ||||||
|  |     msg.msg_namelen = 0; | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |  | ||||||
|  |     msg.msg_control = (caddr_t)cms; | ||||||
|  |     msg.msg_controllen = sizeof cms; | ||||||
|  |  | ||||||
|  |     if((n=recvmsg(sock, &msg, 0)) < 0) { | ||||||
|  |       perror("recvmsg failed"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     if(n == 0){ | ||||||
|  |       perror("recvmsg returned 0"); | ||||||
|  |       return -1; | ||||||
|  |     } | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |     memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); | ||||||
|  |     printf("received fd %d from socket %d\n",fd,sock); | ||||||
|  |     return fd; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void SendFileDescriptor(int fildes,int xmit_to_rank) | ||||||
|  |   { | ||||||
|  |     struct msghdr msg; | ||||||
|  |     struct iovec iov; | ||||||
|  |     struct cmsghdr *cmsg = NULL; | ||||||
|  |     char ctrl[CMSG_SPACE(sizeof(int))]; | ||||||
|  |     char data = ' '; | ||||||
|  |  | ||||||
|  |     memset(&msg, 0, sizeof(struct msghdr)); | ||||||
|  |     memset(ctrl, 0, CMSG_SPACE(sizeof(int))); | ||||||
|  |     iov.iov_base = &data; | ||||||
|  |     iov.iov_len = sizeof(data); | ||||||
|  |      | ||||||
|  |     sprintf(sock_path,sock_path_fmt,xmit_to_rank); | ||||||
|  |     printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout); | ||||||
|  |      | ||||||
|  |     struct sockaddr_un sa_un = { 0 }; | ||||||
|  |     sa_un.sun_family = AF_UNIX; | ||||||
|  |     snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); | ||||||
|  |  | ||||||
|  |     msg.msg_name = (void *)&sa_un; | ||||||
|  |     msg.msg_namelen = sizeof(sa_un); | ||||||
|  |     msg.msg_iov = &iov; | ||||||
|  |     msg.msg_iovlen = 1; | ||||||
|  |     msg.msg_controllen =  CMSG_SPACE(sizeof(int)); | ||||||
|  |     msg.msg_control = ctrl; | ||||||
|  |  | ||||||
|  |     cmsg = CMSG_FIRSTHDR(&msg); | ||||||
|  |     cmsg->cmsg_level = SOL_SOCKET; | ||||||
|  |     cmsg->cmsg_type = SCM_RIGHTS; | ||||||
|  |     cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | ||||||
|  |  | ||||||
|  |     *((int *) CMSG_DATA(cmsg)) = fildes; | ||||||
|  |  | ||||||
|  |     if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed"); | ||||||
|  |   }; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | int main(int argc, char **argv) | ||||||
|  | { | ||||||
|  |   int me = fork()?0:1; | ||||||
|  |    | ||||||
|  |   UnixSockets::Open(me); | ||||||
|  |    | ||||||
|  |   // need MPI barrier | ||||||
|  |   sleep(10); | ||||||
|  |   const char * message = "Hello, World\n"; | ||||||
|  |   if( me ) { | ||||||
|  |     int fd = open("foo",O_RDWR|O_CREAT,0666); | ||||||
|  |     if ( fd < 0 ) { | ||||||
|  |       perror("failed to open file"); | ||||||
|  |       exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |     // rank 1 sends to rank 0 | ||||||
|  |     UnixSockets::SendFileDescriptor(fd,0); | ||||||
|  |     close(fd); | ||||||
|  |   } else { | ||||||
|  |     // rank 0 receives from rank 1 | ||||||
|  |     int fd = UnixSockets::RecvFileDescriptor(); | ||||||
|  |     write(fd,(const void *)message,strlen(message)); | ||||||
|  |     close(fd); | ||||||
|  |   } | ||||||
|  | } | ||||||
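A compact, self-contained check of the `SCM_RIGHTS` mechanism the example above relies on, using `socketpair()` in place of the named `AF_UNIX` sockets (an illustrative simplification, error handling trimmed). The point is that the receiver gets a working duplicate of the sender's open file description:

```cpp
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>
#include <cassert>
#include <cstdio>
#include <cstring>

static void send_fd(int chan, int fd) {
  char data = ' ', ctrl[CMSG_SPACE(sizeof(int))];
  memset(ctrl, 0, sizeof(ctrl));
  struct iovec iov; iov.iov_base = &data; iov.iov_len = 1;
  struct msghdr msg; memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov; msg.msg_iovlen = 1;
  msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
  struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
  c->cmsg_level = SOL_SOCKET;
  c->cmsg_type  = SCM_RIGHTS;
  c->cmsg_len   = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(c), &fd, sizeof(int));
  assert(sendmsg(chan, &msg, 0) == 1);
}

static int recv_fd(int chan) {
  char data, ctrl[CMSG_SPACE(sizeof(int))];
  struct iovec iov; iov.iov_base = &data; iov.iov_len = 1;
  struct msghdr msg; memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov; msg.msg_iovlen = 1;
  msg.msg_control = ctrl; msg.msg_controllen = sizeof(ctrl);
  assert(recvmsg(chan, &msg, 0) == 1);
  int fd; memcpy(&fd, CMSG_DATA(CMSG_FIRSTHDR(&msg)), sizeof(int));
  return fd;
}

int main() {
  int sv[2], p[2];
  assert(socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) == 0);
  assert(pipe(p) == 0);
  send_fd(sv[0], p[1]);       // ship the pipe's write end across
  int wfd = recv_fd(sv[1]);   // kernel dups the open file description
  assert(write(wfd, "ok", 2) == 2);
  char buf[2];
  assert(read(p[0], buf, 2) == 2);
  std::puts("fd passing works");
  return 0;
}
```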
| @@ -1,7 +1,7 @@ | |||||||
| CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` | CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` | ||||||
| ../../configure --enable-comms=mpi-auto \ | ../../configure --enable-comms=mpi-auto \ | ||||||
| --with-lime=$CLIME \ | --with-lime=$CLIME \ | ||||||
| --enable-unified=yes \ | --enable-unified=no \ | ||||||
| --enable-shm=nvlink \ | --enable-shm=nvlink \ | ||||||
| --enable-tracing=timer \ | --enable-tracing=timer \ | ||||||
| --enable-accelerator=hip \ | --enable-accelerator=hip \ | ||||||
|   | |||||||
| @@ -5,8 +5,8 @@ module load emacs | |||||||
| #module load gperftools | #module load gperftools | ||||||
| module load PrgEnv-gnu | module load PrgEnv-gnu | ||||||
| module load rocm/5.3.0 | module load rocm/5.3.0 | ||||||
| module load cray-mpich/8.1.16 | #module load cray-mpich/8.1.16 | ||||||
| #module load cray-mpich/8.1.17 | module load cray-mpich/8.1.17 | ||||||
| module load gmp | module load gmp | ||||||
| module load cray-fftw | module load cray-fftw | ||||||
| module load craype-accel-amd-gfx90a | module load craype-accel-amd-gfx90a | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ | |||||||
| #SBATCH -p QZ1J-ICX-PVC | #SBATCH -p QZ1J-ICX-PVC | ||||||
| ##SBATCH -p QZ1J-SPR-PVC-2C | ##SBATCH -p QZ1J-SPR-PVC-2C | ||||||
|  |  | ||||||
| source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | #source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||||
|  |  | ||||||
| export NT=8 | export NT=8 | ||||||
|  |  | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ | |||||||
|  |  | ||||||
| #SBATCH -p QZ1J-ICX-PVC | #SBATCH -p QZ1J-ICX-PVC | ||||||
|  |  | ||||||
| source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | #source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||||
|  |  | ||||||
| export NT=16 | export NT=16 | ||||||
|  |  | ||||||
| @@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero | |||||||
| export I_MPI_OFFLOAD_CELL=tile | export I_MPI_OFFLOAD_CELL=tile | ||||||
| export EnableImplicitScaling=0 | export EnableImplicitScaling=0 | ||||||
| export EnableWalkerPartition=0 | export EnableWalkerPartition=0 | ||||||
| export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 | #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 | export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 | ||||||
|  |  | ||||||
| #mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log | for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | ||||||
|  | do | ||||||
|  | mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i | ||||||
|  | mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i  | ||||||
|  | done | ||||||
|  |  | ||||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 | mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 | ||||||
|  |  | ||||||
|   | |||||||
| @@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID | |||||||
| echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK | echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK | ||||||
|  |  | ||||||
|  |  | ||||||
| #if [ $MPI_LOCALRANKID = "0" ]  |  | ||||||
| #then |  | ||||||
| #  ~psteinbr/build_pti/ze_tracer -c $@ |  | ||||||
| #  onetrace --chrome-kernel-timeline $@ |  | ||||||
| #else |  | ||||||
|   $@ |   $@ | ||||||
| #fi |  | ||||||
|   | |||||||
| @@ -3,8 +3,14 @@ export https_proxy=http://proxy-chain.intel.com:911 | |||||||
| export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH | export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| module load intel-release | module load intel-release | ||||||
| source /opt/intel/oneapi/PVC_setup.sh | module load intel-comp-rt/embargo-ci-neo | ||||||
|  |  | ||||||
|  | #source /opt/intel/oneapi/PVC_setup.sh | ||||||
| #source /opt/intel/oneapi/ATS_setup.sh | #source /opt/intel/oneapi/ATS_setup.sh | ||||||
|  | #module load intel-nightly/20230331 | ||||||
|  | #module load intel-comp-rt/ci-neo-master/026093 | ||||||
|  |  | ||||||
|  | #module load intel/mpich | ||||||
| module load intel/mpich/pvc45.3 | module load intel/mpich/pvc45.3 | ||||||
| export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH | export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH | ||||||
|  |  | ||||||
|   | |||||||
| @@ -73,12 +73,12 @@ int main (int argc, char ** argv) | |||||||
|   RealD M5  =1.8; |   RealD M5  =1.8; | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl; |   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl; | ||||||
|   std::cout<<GridLogMessage <<"DomainWallFermion vectorised test"<<std::endl; |   std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl; |   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl; | ||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |   std::vector<Complex> boundary = {1,1,1,-1}; | ||||||
|   DomainWallFermionD::ImplParams Params(boundary); |   DomainWallFermionD::ImplParams Params(boundary); | ||||||
|   Coordinate Dirichlet({0,8,8,16,32}); |   //  Coordinate Dirichlet({0,8,8,16,32}); | ||||||
|   Params.dirichlet=Dirichlet; |   //  Params.dirichlet=Dirichlet; | ||||||
|  |  | ||||||
|   DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params); |   DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params); | ||||||
|   TestWhat<DomainWallFermionD>(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5); |   TestWhat<DomainWallFermionD>(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5); | ||||||
|   | |||||||
| @@ -53,7 +53,7 @@ static int readInt(int* argc, char*** argv, std::string&& option, int defaultVal | |||||||
|  |  | ||||||
| static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) { | static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) { | ||||||
|   std::string arg; |   std::string arg; | ||||||
|   float       ret = defaultValue; |   double      ret = defaultValue; | ||||||
|   if(checkPresent(argc, argv, option)) { |   if(checkPresent(argc, argv, option)) { | ||||||
|     arg = getContent(argc, argv, option); |     arg = getContent(argc, argv, option); | ||||||
|     GridCmdOptionFloat(arg, ret); |     GridCmdOptionFloat(arg, ret); | ||||||
|   | |||||||
@@ -1,244 +0,0 @@
    /*************************************************************************************

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT,
  Gamma::Algebra::Gamma5
};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();

  int vol = 1;
  for(int d=0;d<latt_size.size();d++){
    vol = vol * latt_size[d];
  }
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBGRID(&GRID);

  LatticeComplexD    coor(&GRID);

  ComplexD ci(0.0,1.0);

  std::vector<int> seeds({1,2,3,4});
  GridSerialRNG   sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
  GridParallelRNG pRNG(&GRID);
  pRNG.SeedFixedIntegers(seeds);

  LatticeGaugeFieldD Umu(&GRID);
  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge

  ////////////////////////////////////////////////////
  // Wilson test
  ////////////////////////////////////////////////////
  {
    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    src_p(&GRID);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    result(&GRID);

    RealD mass=0.1;
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);

    Dw.M(src,ref);
    std::cout << "Norm src "<<norm2(src)<<std::endl;
    std::cout << "Norm Dw x src "<<norm2(ref)<<std::endl;
    {
      FFT theFFT(&GRID);

      ////////////////
      // operator in Fourier space
      ////////////////
      tmp = ref;
      theFFT.FFT_all_dim(result,tmp,FFT::forward);
      std::cout<<"FFT[ Dw x src ]  "<< norm2(result)<<std::endl;

      tmp = src;
      theFFT.FFT_all_dim(src_p,tmp,FFT::forward);
      std::cout<<"FFT[ src      ]  "<< norm2(src_p)<<std::endl;

      /////////////////////////////////////////////////////////////////
      // work out the predicted Dw x src directly in momentum space
      /////////////////////////////////////////////////////////////////
      auto FGrid = &GRID;
      LatticeFermionD    Kinetic(FGrid); Kinetic = Zero();
      LatticeComplexD    kmu(FGrid);
      LatticeInteger     scoor(FGrid);
      LatticeComplexD    sk (FGrid); sk  = Zero();
      LatticeComplexD    sk2(FGrid); sk2 = Zero();
      LatticeComplexD    W(FGrid);   W   = Zero();
      LatticeComplexD    one(FGrid); one = ComplexD(1.0,0.0);
      ComplexD ci(0.0,1.0);

      for(int mu=0;mu<Nd;mu++) {

        RealD TwoPiL = M_PI * 2.0 / latt_size[mu];

        LatticeCoordinate(kmu,mu);
        kmu = TwoPiL * kmu;

        sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term accumulator
        sk  = sk  +     sin(kmu)    *sin(kmu);

        // -1/2 Dw -> 1/2 gmu (e^{ip} - e^{-ip}) = i sin(p) gmu
        Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p);
      }

      W = mass + sk2;
      Kinetic = Kinetic + W * src_p;

      std::cout<<"Momentum space src         "<< norm2(src_p)<<std::endl;
      std::cout<<"Momentum space Dw x src    "<< norm2(Kinetic)<<std::endl;
      std::cout<<"FT[Coordinate space Dw]    "<< norm2(result)<<std::endl;

      result = result - Kinetic;
      std::cout<<"diff "<< norm2(result)<<std::endl;

    }

    std::cout << " =======================================" <<std::endl;
    std::cout << " Checking FourierFreePropagator x Dw = 1" <<std::endl;
    std::cout << " =======================================" <<std::endl;
|     std::cout << "Dw src = " <<norm2(src)<<std::endl; |  | ||||||
|     std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl; |  | ||||||
|     Dw.M(src,tmp); |  | ||||||

    Dw.FreePropagator(tmp,ref,mass);

    std::cout << "Dw ref = " <<norm2(ref)<<std::endl;

    ref = ref - src;

    std::cout << "Dw ref-src = " <<norm2(ref)<<std::endl;
  }

  ////////////////////////////////////////////////////
  // Wilson prop
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Wilson Mom space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;

    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);

    src=Zero();
    Coordinate point(4,0); // 0,0,0,0
    SpinColourVectorD ferm;
    ferm=Zero();
    ferm()(0)(0) = ComplexD(1.0);
    pokeSite(ferm,src,point);

    RealD mass=0.1;

    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);

    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    Dw.FreePropagator(src,ref,mass);

    Gamma G5(Gamma::Algebra::Gamma5);

    LatticeFermionD    result(&GRID);
    const int sdir=0;

    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dw.Mdag(src,tmp);
    src=tmp;
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
    CG(HermOp,src,result);

    ////////////////////////////////////////////////////////////////////////
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Dw result "<<norm2(result)<<std::endl;
    std::cout << "Dw ref    "<<norm2(ref)<<std::endl;

    diff = ref - result;
    std::cout << "result - ref "<<norm2(diff)<<std::endl;

    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
  }
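
  // Note on the CGNE solve above: CG is applied to the Hermitian normal
  // equations  Ddag D x = Ddag b  (hence the Dw.Mdag(src,tmp) projection of
  // the source before calling CG on MdagMLinearOperator), so `result`
  // approximates D^{-1} b and should match the FFT/Feynman-rule free
  // propagator `ref` up to the solver tolerance.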

  ////////////////////////////////////////////////////
  // Gauge invariance test
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Gauge invariance test \n";
    std::cout<<"****************************************"<<std::endl;
    LatticeGaugeFieldD    U_GT(&GRID); // Gauge transformed field (double, to match Umu)
    LatticeColourMatrixD  g(&GRID);    // local gauge xform matrix
    U_GT = Umu;
    // Apply a random gauge transformation to the gauge field
    SU<Nc>::RandomGaugeTransform(pRNG,U_GT,g);

    LatticeFermionD    src(&GRID);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);

    // could loop over colors
    src=Zero();
    Coordinate point(4,0); // 0,0,0,0
    SpinColourVectorD ferm;
    ferm=Zero();
    ferm()(0)(0) = ComplexD(1.0);
    pokeSite(ferm,src,point);

    RealD mass=0.1;
    WilsonFermionD Dw(U_GT,GRID,RBGRID,mass);

    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    Dw.FreePropagator(src,ref,mass);

    Gamma G5(Gamma::Algebra::Gamma5);

    LatticeFermionD    result(&GRID);
    const int sdir=0;

    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dw.Mdag(src,tmp);
    src=tmp;
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
    CG(HermOp,src,result);

    ////////////////////////////////////////////////////////////////////////
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Dw result "<<norm2(result)<<std::endl;
    std::cout << "Dw ref    "<<norm2(ref)<<std::endl;

    diff = ref - result;
    std::cout << "result - ref "<<norm2(diff)<<std::endl;

    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
  }

  Grid_finalize();
}
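
For reference, the free-field identity the removed test exercised: with unit gauge links the Wilson operator is diagonal in momentum space, and the `Kinetic` field above builds exactly

    \tilde{D}_W(k)\,\tilde{\psi}(k)
      = \Big[\, i\sum_{\mu}\gamma_{\mu}\sin k_{\mu}
            \;+\; m \;+\; \sum_{\mu} 2\sin^{2}\!\big(\tfrac{k_{\mu}}{2}\big) \Big]\,\tilde{\psi}(k),
    \qquad k_{\mu} = \frac{2\pi n_{\mu}}{L_{\mu}},

with `sk2` accumulating the Wilson term, `W = mass + sk2` the diagonal part, and `sin(kmu)*ci*(Gamma(Gmu[mu])*src_p)` the naive kinetic term. `FreePropagator` applies \tilde{D}_W^{-1}, which is why `ref - src` should vanish to FFT/solver precision.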
@@ -476,7 +476,9 @@ int main (int argc, char ** argv)
   //  ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
 
   //////////////////// One flavour boundary det ////////////////////
+  /*
   RationalActionParams OFRp; // Up/down
+  int SP_iters = 3000;
   OFRp.lo       = 6.0e-5;
   OFRp.hi       = 90.0;
   OFRp.inv_pow  = 2;
@@ -489,7 +491,7 @@ int main (int argc, char ** argv)
   //  OFRp.degree   = 16;
   OFRp.precision= 80;
   OFRp.BoundsCheckFreq=0;
-  /*
+  */
   OneFlavourRationalParams OFRp; // Up/down
   OFRp.lo       = 4.0e-5;
   OFRp.hi       = 90.0;
@@ -499,7 +501,6 @@ int main (int argc, char ** argv)
   OFRp.degree   = 18;
   OFRp.precision= 80;
   OFRp.BoundsCheckFreq=0;
-  */
   std::vector<RealD> ActionTolByPole({
       1.0e-7,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,

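Net effect of these hunks: the `/* ... */` pair moves so the RationalActionParams block (including the newly added, currently inert `SP_iters`) is now the commented-out alternative and the OneFlavourRationalParams block is live; both bind the same name `OFRp`, so the action setup below compiles either way. A minimal, Grid-free sketch of that toggle pattern (the stand-in structs and values are illustrative only):

#include <iostream>

// Stand-ins so the sketch compiles without Grid; the real types are Grid's
// RationalActionParams and OneFlavourRationalParams.
struct RationalActionParams     { double lo, hi; int inv_pow, degree, precision; };
struct OneFlavourRationalParams { double lo, hi; int degree, precision; };

int main() {
  /*
  RationalActionParams OFRp;     // alternative block, currently disabled
  OFRp.lo = 6.0e-5; OFRp.hi = 90.0; OFRp.inv_pow = 2;
  */
  OneFlavourRationalParams OFRp; // active block; same name as the alternative
  OFRp.lo = 4.0e-5; OFRp.hi = 90.0; OFRp.degree = 18; OFRp.precision = 80;
  std::cout << "lo=" << OFRp.lo << " degree=" << OFRp.degree << "\n";
}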
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
 
-  const int Ls      = 4;
+  const int Ls      = 8;
   Real beta         = 2.13;
   Real light_mass   = 0.01;
   Real strange_mass = 0.04;

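The only change in this hunk is Ls, the fifth-dimension extent of the domain wall fermions in this HMC setup, from 4 to 8; larger Ls reduces the residual chiral symmetry breaking at roughly twice the fermion cost, since the 5d volume doubles. A minimal sketch of where Ls enters, assuming Grid's standard SpaceTimeGrid helpers (the usual pattern in Grid's tests, not necessarily this file's exact code):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);
  const int Ls = 8; // fifth-dimension extent: the quantity changed in this hunk
  // 4d gauge grids and the 5d fermion grids of extent Ls built over them.
  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian*         FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
  // FGrid/FrbGrid/UGrid/UrbGrid are what a DomainWallFermionD is constructed
  // from (compare the first hunk of this page); doubling Ls doubles the 5d volume.
  Grid_finalize();
}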