mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 09:39:34 +00:00 
			
		
		
		
	Compare commits
	
		
			39 Commits
		
	
	
		
			feature/sh
			...
			feature/no
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 8e161152e4 | ||
|  | 3141ebac10 | ||
|  | 7ede696126 | ||
|  | bf516c3b81 | ||
|  | 441a52ee5d | ||
|  | a8db024c92 | ||
|  | a9c22d5f43 | ||
|  | 3ca41458a3 | ||
|  | 951be75292 | ||
|  | b9113ed310 | ||
|  | 42fb49d3fd | ||
|  | 2a54c9aaab | ||
|  | 0957378679 | ||
|  | 2ed6c76fc5 | ||
|  | d3b9a7fa14 | ||
|  | 75ea306ce9 | ||
|  | 4226c633c4 | ||
|  | 5a4eafbf7e | ||
|  | eb8e26018b | ||
|  | db5ea001a3 | ||
|  | 2846f079e5 | ||
|  | 1d502e4ed6 | ||
|  | 73cdf0fffe | ||
|  | 1c25773319 | ||
|  | c38400b26f | ||
|  | 9c3065b860 | ||
|  | 94eb829d08 | ||
|  | 68392ddb5b | ||
|  | cb6b81ae82 | ||
| 8ef4300412 | |||
| 98a24ebf31 | |||
|  | b12dc89d26 | ||
|  | d80d802f9d | ||
|  | 3d99b09dba | ||
|  | db5f6d3ae3 | ||
|  | 683550f116 | ||
|  | 55d0329624 | ||
|  | 86aaa35294 | ||
|  | 172d3dc93a | 
							
								
								
									
										13
									
								
								.travis.yml
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								.travis.yml
									
									
									
									
									
								
							| @@ -7,7 +7,7 @@ cache: | |||||||
| matrix: | matrix: | ||||||
|   include: |   include: | ||||||
|     - os:        osx |     - os:        osx | ||||||
|       osx_image: xcode7.2 |       osx_image: xcode8.3 | ||||||
|       compiler: clang |       compiler: clang | ||||||
|     - compiler: gcc |     - compiler: gcc | ||||||
|       addons: |       addons: | ||||||
| @@ -73,8 +73,6 @@ before_install: | |||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi |     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi |     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi |     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi |  | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi |  | ||||||
|      |      | ||||||
| install: | install: | ||||||
|     - export CC=$CC$VERSION |     - export CC=$CC$VERSION | ||||||
| @@ -92,15 +90,14 @@ script: | |||||||
|     - cd build |     - cd build | ||||||
|     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none |     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none | ||||||
|     - make -j4  |     - make -j4  | ||||||
|     - ./benchmarks/Benchmark_dwf --threads 1 |     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||||
|     - echo make clean |     - echo make clean | ||||||
|     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none |     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none | ||||||
|     - make -j4 |     - make -j4 | ||||||
|     - ./benchmarks/Benchmark_dwf --threads 1 |     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||||
|     - echo make clean |     - echo make clean | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi |     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi | ||||||
|     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto |     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi | ||||||
|     - make -j4 |  | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi |     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										63
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										63
									
								
								TODO
									
									
									
									
									
								
							| @@ -1,6 +1,28 @@ | |||||||
| TODO: | TODO: | ||||||
| --------------- | --------------- | ||||||
|  |  | ||||||
|  | Peter's work list: | ||||||
|  |  | ||||||
|  | -- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started | ||||||
|  | -- Merge high precision reduction into develop         <-- done | ||||||
|  | -- Precision conversion and sort out localConvert      <--  | ||||||
|  | -- Physical propagator interface | ||||||
|  |  | ||||||
|  | -- multiRHS DWF; benchmark on Cori/BNL for comms elimination | ||||||
|  |    -- slice* linalg routines for multiRHS, BlockCG        <-- started | ||||||
|  |  | ||||||
|  | -- Profile CG, BlockCG, etc... Flop count/rate | ||||||
|  | -- Binary I/O speed up & x-strips | ||||||
|  | -- Half-precision comms                                <-- started | ||||||
|  | -- GaugeFix into central location | ||||||
|  | -- FFTfix in sensible place | ||||||
|  | -- Multigrid Wilson and DWF, compare to other Multigrid implementations | ||||||
|  | -- quaternions                 -- Might not need | ||||||
|  |  | ||||||
|  |  | ||||||
|  | -- Conserved currents | ||||||
|  |  | ||||||
|  | ----- | ||||||
| * Forces; the UdSdU  term in gauge force term is half of what I think it should | * Forces; the UdSdU  term in gauge force term is half of what I think it should | ||||||
|   be. This is a consequence of taking ONLY the first term in: |   be. This is a consequence of taking ONLY the first term in: | ||||||
|  |  | ||||||
| @@ -21,16 +43,8 @@ TODO: | |||||||
|   This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two. |   This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two. | ||||||
|   This 2x is applied by hand in the fermion routines and in the Test_rect_force routine. |   This 2x is applied by hand in the fermion routines and in the Test_rect_force routine. | ||||||
|  |  | ||||||
|  |  | ||||||
| Policies: |  | ||||||
|  |  | ||||||
| * Link smearing/boundary conds; Policy class based implementation ; framework more in place |  | ||||||
|  |  | ||||||
| * Support different boundary conditions (finite temp, chem. potential ... ) | * Support different boundary conditions (finite temp, chem. potential ... ) | ||||||
|  |  | ||||||
| * Support different fermion representations?  |  | ||||||
|   - contained entirely within the integrator presently |  | ||||||
|  |  | ||||||
| - Sign of force term. | - Sign of force term. | ||||||
|  |  | ||||||
| - Reversibility test. | - Reversibility test. | ||||||
| @@ -41,11 +55,6 @@ Policies: | |||||||
|  |  | ||||||
| - Audit oIndex usage for cb behaviour | - Audit oIndex usage for cb behaviour | ||||||
|  |  | ||||||
| - Rectangle gauge actions. |  | ||||||
|   Iwasaki, |  | ||||||
|   Symanzik, |  | ||||||
|   ... etc... |  | ||||||
|  |  | ||||||
| - Prepare multigrid for HMC. - Alternate setup schemes. | - Prepare multigrid for HMC. - Alternate setup schemes. | ||||||
|  |  | ||||||
| - Support for ILDG --- ugly, not done | - Support for ILDG --- ugly, not done | ||||||
| @@ -55,9 +64,11 @@ Policies: | |||||||
| - FFTnD ? | - FFTnD ? | ||||||
|  |  | ||||||
| - Gparity; hand opt use template specialisation elegance to enable the optimised paths ? | - Gparity; hand opt use template specialisation elegance to enable the optimised paths ? | ||||||
|  |  | ||||||
| - Gparity force term; Gparity (R)HMC. | - Gparity force term; Gparity (R)HMC. | ||||||
| - Random number state save restore |  | ||||||
| - Mobius implementation clean up to rmove #if 0 stale code sequences | - Mobius implementation clean up to rmove #if 0 stale code sequences | ||||||
|  |  | ||||||
| - CG -- profile carefully, kernel fusion, whole CG performance measurements. | - CG -- profile carefully, kernel fusion, whole CG performance measurements. | ||||||
|  |  | ||||||
| ================================================================ | ================================================================ | ||||||
| @@ -90,6 +101,7 @@ Insert/Extract | |||||||
| Not sure of status of this -- reverify. Things are working nicely now though. | Not sure of status of this -- reverify. Things are working nicely now though. | ||||||
|  |  | ||||||
| * Make the Tensor types and Complex etc... play more nicely. | * Make the Tensor types and Complex etc... play more nicely. | ||||||
|  |  | ||||||
|   - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > > |   - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > > | ||||||
|     QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I |     QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I | ||||||
|     want to introduce a syntax that does not require this. |     want to introduce a syntax that does not require this. | ||||||
| @@ -112,6 +124,8 @@ Not sure of status of this -- reverify. Things are working nicely now though. | |||||||
| RECENT | RECENT | ||||||
| --------------- | --------------- | ||||||
|  |  | ||||||
|  |   - Support different fermion representations? -- DONE | ||||||
|  |   - contained entirely within the integrator presently | ||||||
|   - Clean up HMC                                                             -- DONE |   - Clean up HMC                                                             -- DONE | ||||||
|   - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE |   - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE | ||||||
|   - Simplified the integrators a bit.                                        -- DONE |   - Simplified the integrators a bit.                                        -- DONE | ||||||
| @@ -123,6 +137,26 @@ RECENT | |||||||
|   - Parallel io improvements                                  -- DONE |   - Parallel io improvements                                  -- DONE | ||||||
|   - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE |   - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | DONE: | ||||||
|  | - MultiArray -- MultiRHS done | ||||||
|  | - ConjugateGradientMultiShift -- DONE | ||||||
|  | - MCR                         -- DONE | ||||||
|  | - Remez -- Mike or Boost?     -- DONE | ||||||
|  | - Proto (ET)                  -- DONE | ||||||
|  | - uBlas                       -- DONE ; Eigen | ||||||
|  | - Potentially Useful Boost libraries -- DONE ; Eigen | ||||||
|  | - Aligned allocator; memory pool -- DONE | ||||||
|  | - Multiprecision              -- DONE | ||||||
|  | - Serialization               -- DONE | ||||||
|  | - Regex -- Not needed | ||||||
|  | - Tokenize -- Why? | ||||||
|  |  | ||||||
|  | - Random number state save restore -- DONE | ||||||
|  | - Rectangle gauge actions. -- DONE | ||||||
|  |   Iwasaki, | ||||||
|  |   Symanzik, | ||||||
|  |   ... etc... | ||||||
| Done: Cayley, Partial , ContFrac force terms. | Done: Cayley, Partial , ContFrac force terms. | ||||||
|  |  | ||||||
| DONE | DONE | ||||||
| @@ -207,6 +241,7 @@ Done | |||||||
| FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane) | FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane) | ||||||
| ====================================================================================================== | ====================================================================================================== | ||||||
|  |  | ||||||
|  | * Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE | ||||||
| * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE | * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE | ||||||
|   user pass these? Is this a QCD specific? |   user pass these? Is this a QCD specific? | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -83,6 +83,19 @@ case ${ac_LAPACK} in | |||||||
|         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; |         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|  | ############### FP16 conversions | ||||||
|  | AC_ARG_ENABLE([fp16], | ||||||
|  |     [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],  | ||||||
|  |     [ac_FP16=${enable_fp16}], [ac_FP16=no]) | ||||||
|  | case ${ac_FP16} in | ||||||
|  |     no) | ||||||
|  |         ;; | ||||||
|  |     yes) | ||||||
|  |         AC_DEFINE([USE_FP16],[1],[conversion to fp16]);; | ||||||
|  |     *) | ||||||
|  | 	;; | ||||||
|  | esac | ||||||
|  |  | ||||||
| ############### MKL | ############### MKL | ||||||
| AC_ARG_ENABLE([mkl], | AC_ARG_ENABLE([mkl], | ||||||
|     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], |     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], | ||||||
| @@ -179,16 +192,16 @@ case ${ax_cv_cxx_compiler_vendor} in | |||||||
|         SIMD_FLAGS='-msse4.2';; |         SIMD_FLAGS='-msse4.2';; | ||||||
|       AVX) |       AVX) | ||||||
|         AC_DEFINE([AVX1],[1],[AVX intrinsics]) |         AC_DEFINE([AVX1],[1],[AVX intrinsics]) | ||||||
|         SIMD_FLAGS='-mavx';; |         SIMD_FLAGS='-mavx -mf16c';; | ||||||
|       AVXFMA4) |       AVXFMA4) | ||||||
|         AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) |         AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) | ||||||
|         SIMD_FLAGS='-mavx -mfma4';; |         SIMD_FLAGS='-mavx -mfma4 -mf16c';; | ||||||
|       AVXFMA) |       AVXFMA) | ||||||
|         AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) |         AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) | ||||||
|         SIMD_FLAGS='-mavx -mfma';; |         SIMD_FLAGS='-mavx -mfma -mf16c';; | ||||||
|       AVX2) |       AVX2) | ||||||
|         AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) |         AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) | ||||||
|         SIMD_FLAGS='-mavx2 -mfma';; |         SIMD_FLAGS='-mavx2 -mfma -mf16c';; | ||||||
|       AVX512) |       AVX512) | ||||||
|         AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) |         AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) | ||||||
|         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; |         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; | ||||||
|   | |||||||
| @@ -46,7 +46,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||||
|  |  | ||||||
| // Lanczos support | // Lanczos support | ||||||
| #include <Grid/algorithms/iterative/MatrixUtils.h> | //#include <Grid/algorithms/iterative/MatrixUtils.h> | ||||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> | #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> | ||||||
| #include <Grid/algorithms/CoarsenedMatrix.h> | #include <Grid/algorithms/CoarsenedMatrix.h> | ||||||
| #include <Grid/algorithms/FFT.h> | #include <Grid/algorithms/FFT.h> | ||||||
|   | |||||||
							
								
								
									
										366
									
								
								lib/algorithms/iterative/BlockConjugateGradient.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										366
									
								
								lib/algorithms/iterative/BlockConjugateGradient.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,366 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h | ||||||
|  |  | ||||||
|  | Copyright (C) 2017 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||||
|  | #define GRID_BLOCK_CONJUGATE_GRADIENT_H | ||||||
|  |  | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  | ////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Block conjugate gradient. Dimension zero should be the block direction | ||||||
|  | ////////////////////////////////////////////////////////////////////////// | ||||||
|  | template <class Field> | ||||||
|  | class BlockConjugateGradient : public OperatorFunction<Field> { | ||||||
|  |  public: | ||||||
|  |  | ||||||
|  |   typedef typename Field::scalar_type scomplex; | ||||||
|  |  | ||||||
|  |   const int blockDim = 0; | ||||||
|  |  | ||||||
|  |   int Nblock; | ||||||
|  |   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||||
|  |                            // Defaults true. | ||||||
|  |   RealD Tolerance; | ||||||
|  |   Integer MaxIterations; | ||||||
|  |   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||||
|  |    | ||||||
|  |   BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||||
|  |     : Tolerance(tol), | ||||||
|  |     MaxIterations(maxit), | ||||||
|  |     ErrorOnNoConverge(err_on_no_conv){}; | ||||||
|  |  | ||||||
|  | void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)  | ||||||
|  | { | ||||||
|  |   int Orthog = 0; // First dimension is block dim | ||||||
|  |   Nblock = Src._grid->_fdimensions[Orthog]; | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||||
|  |  | ||||||
|  |   Psi.checkerboard = Src.checkerboard; | ||||||
|  |   conformable(Psi, Src); | ||||||
|  |  | ||||||
|  |   Field P(Src); | ||||||
|  |   Field AP(Src); | ||||||
|  |   Field R(Src); | ||||||
|  |    | ||||||
|  |   Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||||
|  |   Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||||
|  |   Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |   Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |  | ||||||
|  |   Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |   Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |  | ||||||
|  |   // Initial residual computation & set up | ||||||
|  |   std::vector<RealD> residuals(Nblock); | ||||||
|  |   std::vector<RealD> ssq(Nblock); | ||||||
|  |  | ||||||
|  |   sliceNorm(ssq,Src,Orthog); | ||||||
|  |   RealD sssum=0; | ||||||
|  |   for(int b=0;b<Nblock;b++) sssum+=ssq[b]; | ||||||
|  |  | ||||||
|  |   sliceNorm(residuals,Src,Orthog); | ||||||
|  |   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||||
|  |  | ||||||
|  |   sliceNorm(residuals,Psi,Orthog); | ||||||
|  |   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||||
|  |  | ||||||
|  |   // Initial search dir is guess | ||||||
|  |   Linop.HermOp(Psi, AP); | ||||||
|  |    | ||||||
|  |  | ||||||
|  |   /************************************************************************ | ||||||
|  |    * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980) | ||||||
|  |    ************************************************************************ | ||||||
|  |    * O'Leary : R = B - A X | ||||||
|  |    * O'Leary : P = M R ; preconditioner M = 1 | ||||||
|  |    * O'Leary : alpha = PAP^{-1} RMR | ||||||
|  |    * O'Leary : beta  = RMR^{-1}_old RMR_new | ||||||
|  |    * O'Leary : X=X+Palpha | ||||||
|  |    * O'Leary : R_new=R_old-AP alpha | ||||||
|  |    * O'Leary : P=MR_new+P beta | ||||||
|  |    */ | ||||||
|  |  | ||||||
|  |   R = Src - AP;   | ||||||
|  |   P = R; | ||||||
|  |   sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||||
|  |  | ||||||
|  |   GridStopWatch sliceInnerTimer; | ||||||
|  |   GridStopWatch sliceMaddTimer; | ||||||
|  |   GridStopWatch MatrixTimer; | ||||||
|  |   GridStopWatch SolverTimer; | ||||||
|  |   SolverTimer.Start(); | ||||||
|  |  | ||||||
|  |   int k; | ||||||
|  |   for (k = 1; k <= MaxIterations; k++) { | ||||||
|  |  | ||||||
|  |     RealD rrsum=0; | ||||||
|  |     for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b)); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||||
|  | 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||||
|  |  | ||||||
|  |     MatrixTimer.Start(); | ||||||
|  |     Linop.HermOp(P, AP); | ||||||
|  |     MatrixTimer.Stop(); | ||||||
|  |  | ||||||
|  |     // Alpha | ||||||
|  |     sliceInnerTimer.Start(); | ||||||
|  |     sliceInnerProductMatrix(m_pAp,P,AP,Orthog); | ||||||
|  |     sliceInnerTimer.Stop(); | ||||||
|  |     m_pAp_inv = m_pAp.inverse(); | ||||||
|  |     m_alpha   = m_pAp_inv * m_rr ; | ||||||
|  |  | ||||||
|  |     // Psi, R update | ||||||
|  |     sliceMaddTimer.Start(); | ||||||
|  |     sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||||
|  |     sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||||
|  |     sliceMaddTimer.Stop(); | ||||||
|  |  | ||||||
|  |     // Beta | ||||||
|  |     m_rr_inv = m_rr.inverse(); | ||||||
|  |     sliceInnerTimer.Start(); | ||||||
|  |     sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||||
|  |     sliceInnerTimer.Stop(); | ||||||
|  |     m_beta = m_rr_inv *m_rr; | ||||||
|  |  | ||||||
|  |     // Search update | ||||||
|  |     sliceMaddTimer.Start(); | ||||||
|  |     sliceMaddMatrix(AP,m_beta,P,R,Orthog); | ||||||
|  |     sliceMaddTimer.Stop(); | ||||||
|  |     P= AP; | ||||||
|  |  | ||||||
|  |     /********************* | ||||||
|  |      * convergence monitor | ||||||
|  |      ********************* | ||||||
|  |      */ | ||||||
|  |     RealD max_resid=0; | ||||||
|  |     for(int b=0;b<Nblock;b++){ | ||||||
|  |       RealD rr = real(m_rr(b,b))/ssq[b]; | ||||||
|  |       if ( rr > max_resid ) max_resid = rr; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     if ( max_resid < Tolerance*Tolerance ) {  | ||||||
|  |  | ||||||
|  |       SolverTimer.Stop(); | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl; | ||||||
|  |       for(int b=0;b<Nblock;b++){ | ||||||
|  | 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl; | ||||||
|  |       } | ||||||
|  |       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||||
|  |  | ||||||
|  |       Linop.HermOp(Psi, AP); | ||||||
|  |       AP = AP-Src; | ||||||
|  |       std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||||
|  | 	     | ||||||
|  |       IterationsToComplete = k; | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |   std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl; | ||||||
|  |  | ||||||
|  |   if (ErrorOnNoConverge) assert(0); | ||||||
|  |   IterationsToComplete = k; | ||||||
|  | } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ////////////////////////////////////////////////////////////////////////// | ||||||
|  | // multiRHS conjugate gradient. Dimension zero should be the block direction | ||||||
|  | ////////////////////////////////////////////////////////////////////////// | ||||||
|  | template <class Field> | ||||||
|  | class MultiRHSConjugateGradient : public OperatorFunction<Field> { | ||||||
|  |  public: | ||||||
|  |  | ||||||
|  |   typedef typename Field::scalar_type scomplex; | ||||||
|  |  | ||||||
|  |   const int blockDim = 0; | ||||||
|  |  | ||||||
|  |   int Nblock; | ||||||
|  |   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge. | ||||||
|  |                            // Defaults true. | ||||||
|  |   RealD Tolerance; | ||||||
|  |   Integer MaxIterations; | ||||||
|  |   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||||
|  |    | ||||||
|  |    MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||||
|  |     : Tolerance(tol), | ||||||
|  |     MaxIterations(maxit), | ||||||
|  |     ErrorOnNoConverge(err_on_no_conv){}; | ||||||
|  |  | ||||||
|  | void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)  | ||||||
|  | { | ||||||
|  |   int Orthog = 0; // First dimension is block dim | ||||||
|  |   Nblock = Src._grid->_fdimensions[Orthog]; | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||||
|  |  | ||||||
|  |   Psi.checkerboard = Src.checkerboard; | ||||||
|  |   conformable(Psi, Src); | ||||||
|  |  | ||||||
|  |   Field P(Src); | ||||||
|  |   Field AP(Src); | ||||||
|  |   Field R(Src); | ||||||
|  |    | ||||||
|  |   std::vector<ComplexD> v_pAp(Nblock); | ||||||
|  |   std::vector<RealD> v_rr (Nblock); | ||||||
|  |   std::vector<RealD> v_rr_inv(Nblock); | ||||||
|  |   std::vector<RealD> v_alpha(Nblock); | ||||||
|  |   std::vector<RealD> v_beta(Nblock); | ||||||
|  |  | ||||||
|  |   // Initial residual computation & set up | ||||||
|  |   std::vector<RealD> residuals(Nblock); | ||||||
|  |   std::vector<RealD> ssq(Nblock); | ||||||
|  |  | ||||||
|  |   sliceNorm(ssq,Src,Orthog); | ||||||
|  |   RealD sssum=0; | ||||||
|  |   for(int b=0;b<Nblock;b++) sssum+=ssq[b]; | ||||||
|  |  | ||||||
|  |   sliceNorm(residuals,Src,Orthog); | ||||||
|  |   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||||
|  |  | ||||||
|  |   sliceNorm(residuals,Psi,Orthog); | ||||||
|  |   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } | ||||||
|  |  | ||||||
|  |   // Initial search dir is guess | ||||||
|  |   Linop.HermOp(Psi, AP); | ||||||
|  |  | ||||||
|  |   R = Src - AP;   | ||||||
|  |   P = R; | ||||||
|  |   sliceNorm(v_rr,R,Orthog); | ||||||
|  |  | ||||||
|  |   GridStopWatch sliceInnerTimer; | ||||||
|  |   GridStopWatch sliceMaddTimer; | ||||||
|  |   GridStopWatch sliceNormTimer; | ||||||
|  |   GridStopWatch MatrixTimer; | ||||||
|  |   GridStopWatch SolverTimer; | ||||||
|  |  | ||||||
|  |   SolverTimer.Start(); | ||||||
|  |   int k; | ||||||
|  |   for (k = 1; k <= MaxIterations; k++) { | ||||||
|  |  | ||||||
|  |     RealD rrsum=0; | ||||||
|  |     for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||||
|  | 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||||
|  |  | ||||||
|  |     MatrixTimer.Start(); | ||||||
|  |     Linop.HermOp(P, AP); | ||||||
|  |     MatrixTimer.Stop(); | ||||||
|  |  | ||||||
|  |     // Alpha | ||||||
|  |     //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); | ||||||
|  |     sliceInnerTimer.Start(); | ||||||
|  |     sliceInnerProductVector(v_pAp,P,AP,Orthog); | ||||||
|  |     sliceInnerTimer.Stop(); | ||||||
|  |     for(int b=0;b<Nblock;b++){ | ||||||
|  |       //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl; | ||||||
|  |       v_alpha[b] = v_rr[b]/real(v_pAp[b]); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Psi, R update | ||||||
|  |     sliceMaddTimer.Start(); | ||||||
|  |     sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||||
|  |     sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||||
|  |     sliceMaddTimer.Stop(); | ||||||
|  |  | ||||||
|  |     // Beta | ||||||
|  |     for(int b=0;b<Nblock;b++){ | ||||||
|  |       v_rr_inv[b] = 1.0/v_rr[b]; | ||||||
|  |     } | ||||||
|  |     sliceNormTimer.Start(); | ||||||
|  |     sliceNorm(v_rr,R,Orthog); | ||||||
|  |     sliceNormTimer.Stop(); | ||||||
|  |     for(int b=0;b<Nblock;b++){ | ||||||
|  |       v_beta[b] = v_rr_inv[b] *v_rr[b]; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Search update | ||||||
|  |     sliceMaddTimer.Start(); | ||||||
|  |     sliceMaddVector(P,v_beta,P,R,Orthog); | ||||||
|  |     sliceMaddTimer.Stop(); | ||||||
|  |  | ||||||
|  |     /********************* | ||||||
|  |      * convergence monitor | ||||||
|  |      ********************* | ||||||
|  |      */ | ||||||
|  |     RealD max_resid=0; | ||||||
|  |     for(int b=0;b<Nblock;b++){ | ||||||
|  |       RealD rr = v_rr[b]/ssq[b]; | ||||||
|  |       if ( rr > max_resid ) max_resid = rr; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     if ( max_resid < Tolerance*Tolerance ) {  | ||||||
|  |  | ||||||
|  |       SolverTimer.Stop(); | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl; | ||||||
|  |       for(int b=0;b<Nblock;b++){ | ||||||
|  | 	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl; | ||||||
|  |       } | ||||||
|  |       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||||
|  |  | ||||||
|  |       Linop.HermOp(Psi, AP); | ||||||
|  |       AP = AP-Src; | ||||||
|  |       std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       IterationsToComplete = k; | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |   std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl; | ||||||
|  |  | ||||||
|  |   if (ErrorOnNoConverge) assert(0); | ||||||
|  |   IterationsToComplete = k; | ||||||
|  | } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
|  | #endif | ||||||
| @@ -78,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|     cp = a; |     cp = a; | ||||||
|     ssq = norm2(src); |     ssq = norm2(src); | ||||||
|  |  | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; | ||||||
|               << "ConjugateGradient: guess " << guess << std::endl; |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   src " << ssq << std::endl; | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:    mp " << d << std::endl; | ||||||
|               << "ConjugateGradient:   src " << ssq << std::endl; |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   mmp " << b << std::endl; | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:  cp,r " << cp << std::endl; | ||||||
|               << "ConjugateGradient:    mp " << d << std::endl; |     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:     p " << a << std::endl; | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |  | ||||||
|               << "ConjugateGradient:   mmp " << b << std::endl; |  | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |  | ||||||
|               << "ConjugateGradient:  cp,r " << cp << std::endl; |  | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |  | ||||||
|               << "ConjugateGradient:     p " << a << std::endl; |  | ||||||
|  |  | ||||||
|     RealD rsq = Tolerance * Tolerance * ssq; |     RealD rsq = Tolerance * Tolerance * ssq; | ||||||
|  |  | ||||||
| @@ -99,8 +93,7 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     std::cout << GridLogIterative << std::setprecision(4) |     std::cout << GridLogIterative << std::setprecision(4) | ||||||
|               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq |               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; | ||||||
|               << std::endl; |  | ||||||
|  |  | ||||||
|     GridStopWatch LinalgTimer; |     GridStopWatch LinalgTimer; | ||||||
|     GridStopWatch MatrixTimer; |     GridStopWatch MatrixTimer; | ||||||
| @@ -145,19 +138,20 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|         RealD resnorm = sqrt(norm2(p)); |         RealD resnorm = sqrt(norm2(p)); | ||||||
|         RealD true_residual = resnorm / srcnorm; |         RealD true_residual = resnorm / srcnorm; | ||||||
|  |  | ||||||
|         std::cout << GridLogMessage |         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; | ||||||
|                   << "ConjugateGradient: Converged on iteration " << k << std::endl; |         std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||||
|         std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) | 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||||
|                   << " true residual " << true_residual << " target " | 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||||
|                   << Tolerance << std::endl; |  | ||||||
|         std::cout << GridLogMessage << "Time elapsed: Iterations " |         std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||||
|                   << SolverTimer.Elapsed() << " Matrix  " | 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||||
|                   << MatrixTimer.Elapsed() << " Linalg " | 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||||
|                   << LinalgTimer.Elapsed(); | 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||||
|         std::cout << std::endl; |  | ||||||
|  |  | ||||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); |         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||||
|  |  | ||||||
| 	IterationsToComplete = k;	 | 	IterationsToComplete = k;	 | ||||||
|  |  | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -30,6 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #define GRID_IRL_H | #define GRID_IRL_H | ||||||
|  |  | ||||||
| #include <string.h> //memset | #include <string.h> //memset | ||||||
|  |  | ||||||
| #ifdef USE_LAPACK | #ifdef USE_LAPACK | ||||||
| void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | ||||||
|                    double *vl, double *vu, int *il, int *iu, double *abstol, |                    double *vl, double *vu, int *il, int *iu, double *abstol, | ||||||
| @@ -37,8 +38,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | |||||||
|                    double *work, int *lwork, int *iwork, int *liwork, |                    double *work, int *lwork, int *iwork, int *liwork, | ||||||
|                    int *info); |                    int *info); | ||||||
| #endif | #endif | ||||||
| #include "DenseMatrix.h" |  | ||||||
| #include "EigenSort.h" | #include <Grid/algorithms/densematrix/DenseMatrix.h> | ||||||
|  | #include <Grid/algorithms/iterative/EigenSort.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -1088,8 +1090,6 @@ static void Lock(DenseMatrix<T> &H, 	// Hess mtx | |||||||
| 		 int dfg, | 		 int dfg, | ||||||
| 		 bool herm) | 		 bool herm) | ||||||
| {	 | {	 | ||||||
|  |  | ||||||
|  |  | ||||||
|   //ForceTridiagonal(H); |   //ForceTridiagonal(H); | ||||||
|  |  | ||||||
|   int M = H.dim; |   int M = H.dim; | ||||||
| @@ -1122,7 +1122,6 @@ static void Lock(DenseMatrix<T> &H, 	// Hess mtx | |||||||
|   AH = Hermitian(QQ)*AH; |   AH = Hermitian(QQ)*AH; | ||||||
|   AH = AH*QQ; |   AH = AH*QQ; | ||||||
|  |  | ||||||
|  |  | ||||||
|   for(int i=con;i<M;i++){ |   for(int i=con;i<M;i++){ | ||||||
|     for(int j=con;j<M;j++){ |     for(int j=con;j<M;j++){ | ||||||
|       Q[i][j]=QQ[i-con][j-con]; |       Q[i][j]=QQ[i-con][j-con]; | ||||||
|   | |||||||
| @@ -1,453 +0,0 @@ | |||||||
|     /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/algorithms/iterative/Matrix.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #ifndef MATRIX_H |  | ||||||
| #define MATRIX_H |  | ||||||
|  |  | ||||||
| #include <cstdlib> |  | ||||||
| #include <string> |  | ||||||
| #include <cmath> |  | ||||||
| #include <vector> |  | ||||||
| #include <iostream> |  | ||||||
| #include <iomanip> |  | ||||||
| #include <complex> |  | ||||||
| #include <typeinfo> |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /** Sign function **/ |  | ||||||
| template <class T> T sign(T p){return ( p/abs(p) );} |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
| ///////////////////// Hijack STL containers for our wicked means ///////////////////////////////////////// |  | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
| template<class T> using Vector = Vector<T>; |  | ||||||
| template<class T> using Matrix = Vector<Vector<T> >; |  | ||||||
|  |  | ||||||
| template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); } |  | ||||||
|  |  | ||||||
| template<class T> void Resize(Matrix<T > & mat, int N, int M) {  |  | ||||||
|   mat.resize(N); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|     mat[i].resize(M); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| template<class T> void Size(Vector<T> & vec, int &N)  |  | ||||||
| {  |  | ||||||
|   N= vec.size(); |  | ||||||
| } |  | ||||||
| template<class T> void Size(Matrix<T> & mat, int &N,int &M)  |  | ||||||
| {  |  | ||||||
|   N= mat.size(); |  | ||||||
|   M= mat[0].size(); |  | ||||||
| } |  | ||||||
| template<class T> void SizeSquare(Matrix<T> & mat, int &N)  |  | ||||||
| {  |  | ||||||
|   int M; Size(mat,N,M); |  | ||||||
|   assert(N==M); |  | ||||||
| } |  | ||||||
| template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1)  |  | ||||||
| {  |  | ||||||
|   int N2,M2; |  | ||||||
|   Size(mat1,N1,M1); |  | ||||||
|   Size(mat2,N2,M2); |  | ||||||
|   assert(N1==N2); |  | ||||||
|   assert(M1==M2); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| //***************************************** |  | ||||||
| //*	(Complex) Vector operations	* |  | ||||||
| //***************************************** |  | ||||||
|  |  | ||||||
| /**Conj of a Vector **/ |  | ||||||
| template <class T> Vector<T> conj(Vector<T> p){ |  | ||||||
| 	Vector<T> q(p.size()); |  | ||||||
| 	for(int i=0;i<p.size();i++){q[i] = conj(p[i]);} |  | ||||||
| 	return q; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Norm of a Vector**/ |  | ||||||
| template <class T> T norm(Vector<T> p){ |  | ||||||
| 	T sum = 0; |  | ||||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);} |  | ||||||
| 	return abs(sqrt(sum)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Norm squared of a Vector **/ |  | ||||||
| template <class T> T norm2(Vector<T> p){ |  | ||||||
| 	T sum = 0; |  | ||||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);} |  | ||||||
| 	return abs((sum)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Sum elements of a Vector **/ |  | ||||||
| template <class T> T trace(Vector<T> p){ |  | ||||||
| 	T sum = 0; |  | ||||||
| 	for(int i=0;i<p.size();i++){sum = sum + p[i];} |  | ||||||
| 	return sum; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Fill a Vector with constant c **/ |  | ||||||
| template <class T> void Fill(Vector<T> &p, T c){ |  | ||||||
| 	for(int i=0;i<p.size();i++){p[i] = c;} |  | ||||||
| } |  | ||||||
| /** Normalize a Vector **/ |  | ||||||
| template <class T> void normalize(Vector<T> &p){ |  | ||||||
| 	T m = norm(p); |  | ||||||
| 	if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;} |  | ||||||
| } |  | ||||||
| /** Vector by scalar **/ |  | ||||||
| template <class T, class U> Vector<T> times(Vector<T> p, U s){ |  | ||||||
| 	for(int i=0;i<p.size();i++){p[i] *= s;} |  | ||||||
| 	return p; |  | ||||||
| } |  | ||||||
| template <class T, class U> Vector<T> times(U s, Vector<T> p){ |  | ||||||
| 	for(int i=0;i<p.size();i++){p[i] *= s;} |  | ||||||
| 	return p; |  | ||||||
| } |  | ||||||
| /** inner product of a and b = conj(a) . b **/ |  | ||||||
| template <class T> T inner(Vector<T> a, Vector<T> b){ |  | ||||||
| 	T m = 0.; |  | ||||||
| 	for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];} |  | ||||||
| 	return m; |  | ||||||
| } |  | ||||||
| /** sum of a and b = a + b **/ |  | ||||||
| template <class T> Vector<T> add(Vector<T> a, Vector<T> b){ |  | ||||||
| 	Vector<T> m(a.size()); |  | ||||||
| 	for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];} |  | ||||||
| 	return m; |  | ||||||
| } |  | ||||||
| /** sum of a and b = a - b **/ |  | ||||||
| template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){ |  | ||||||
| 	Vector<T> m(a.size()); |  | ||||||
| 	for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];} |  | ||||||
| 	return m; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /**  |  | ||||||
|  ********************************* |  | ||||||
|  *	Matrices	         * |  | ||||||
|  ********************************* |  | ||||||
|  **/ |  | ||||||
|  |  | ||||||
| template<class T> void Fill(Matrix<T> & mat, T&val) {  |  | ||||||
|   int N,M; |  | ||||||
|   Size(mat,N,M); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|   for(int j=0;j<M;j++){ |  | ||||||
|     mat[i][j] = val; |  | ||||||
|   }} |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Transpose of a matrix **/ |  | ||||||
| Matrix<T> Transpose(Matrix<T> & mat){ |  | ||||||
|   int N,M; |  | ||||||
|   Size(mat,N,M); |  | ||||||
|   Matrix C; Resize(C,M,N); |  | ||||||
|   for(int i=0;i<M;i++){ |  | ||||||
|   for(int j=0;j<N;j++){ |  | ||||||
|     C[i][j] = mat[j][i]; |  | ||||||
|   }}  |  | ||||||
|   return C; |  | ||||||
| } |  | ||||||
| /** Set Matrix to unit matrix **/ |  | ||||||
| template<class T> void Unity(Matrix<T> &mat){ |  | ||||||
|   int N;  SizeSquare(mat,N); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|     for(int j=0;j<N;j++){ |  | ||||||
|       if ( i==j ) A[i][j] = 1; |  | ||||||
|       else        A[i][j] = 0; |  | ||||||
|     }  |  | ||||||
|   }  |  | ||||||
| } |  | ||||||
| /** Add C * I to matrix **/ |  | ||||||
| template<class T> |  | ||||||
| void PlusUnit(Matrix<T> & A,T c){ |  | ||||||
|   int dim;  SizeSquare(A,dim); |  | ||||||
|   for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** return the Hermitian conjugate of matrix **/ |  | ||||||
| Matrix<T> HermitianConj(Matrix<T> &mat){ |  | ||||||
|  |  | ||||||
|   int dim; SizeSquare(mat,dim); |  | ||||||
|  |  | ||||||
|   Matrix<T> C; Resize(C,dim,dim); |  | ||||||
|  |  | ||||||
|   for(int i=0;i<dim;i++){ |  | ||||||
|     for(int j=0;j<dim;j++){ |  | ||||||
|       C[i][j] = conj(mat[j][i]); |  | ||||||
|     }  |  | ||||||
|   }  |  | ||||||
|   return C; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** return diagonal entries as a Vector **/ |  | ||||||
| Vector<T> diag(Matrix<T> &A) |  | ||||||
| { |  | ||||||
|   int dim; SizeSquare(A,dim); |  | ||||||
|   Vector<T> d; Resize(d,dim); |  | ||||||
|  |  | ||||||
|   for(int i=0;i<dim;i++){ |  | ||||||
|     d[i] = A[i][i]; |  | ||||||
|   } |  | ||||||
|   return d; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Left multiply by a Vector **/ |  | ||||||
| Vector<T> operator *(Vector<T> &B,Matrix<T> &A) |  | ||||||
| { |  | ||||||
|   int K,M,N;  |  | ||||||
|   Size(B,K); |  | ||||||
|   Size(A,M,N); |  | ||||||
|   assert(K==M); |  | ||||||
|    |  | ||||||
|   Vector<T> C; Resize(C,N); |  | ||||||
|  |  | ||||||
|   for(int j=0;j<N;j++){ |  | ||||||
|     T sum = 0.0; |  | ||||||
|     for(int i=0;i<M;i++){ |  | ||||||
|       sum += B[i] * A[i][j]; |  | ||||||
|     } |  | ||||||
|     C[j] =  sum; |  | ||||||
|   } |  | ||||||
|   return C;  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** return 1/diagonal entries as a Vector **/ |  | ||||||
| Vector<T> inv_diag(Matrix<T> & A){ |  | ||||||
|   int dim; SizeSquare(A,dim); |  | ||||||
|   Vector<T> d; Resize(d,dim); |  | ||||||
|   for(int i=0;i<dim;i++){ |  | ||||||
|     d[i] = 1.0/A[i][i]; |  | ||||||
|   } |  | ||||||
|   return d; |  | ||||||
| } |  | ||||||
| /** Matrix Addition **/ |  | ||||||
| inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B) |  | ||||||
| { |  | ||||||
|   int N,M  ; SizeSame(A,B,N,M); |  | ||||||
|   Matrix C; Resize(C,N,M); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|     for(int j=0;j<M;j++){ |  | ||||||
|       C[i][j] = A[i][j] +  B[i][j]; |  | ||||||
|     }  |  | ||||||
|   }  |  | ||||||
|   return C; |  | ||||||
| }  |  | ||||||
| /** Matrix Subtraction **/ |  | ||||||
| inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){ |  | ||||||
|   int N,M  ; SizeSame(A,B,N,M); |  | ||||||
|   Matrix C; Resize(C,N,M); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|   for(int j=0;j<M;j++){ |  | ||||||
|     C[i][j] = A[i][j] -  B[i][j]; |  | ||||||
|   }} |  | ||||||
|   return C; |  | ||||||
| }  |  | ||||||
|  |  | ||||||
| /** Matrix scalar multiplication **/ |  | ||||||
| inline Matrix<T> operator* (Matrix<T> & A,T c){ |  | ||||||
|   int N,M; Size(A,N,M); |  | ||||||
|   Matrix C; Resize(C,N,M); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|   for(int j=0;j<M;j++){ |  | ||||||
|     C[i][j] = A[i][j]*c; |  | ||||||
|   }}  |  | ||||||
|   return C; |  | ||||||
| }  |  | ||||||
| /** Matrix Matrix multiplication **/ |  | ||||||
| inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){ |  | ||||||
|   int K,L,N,M; |  | ||||||
|   Size(A,K,L); |  | ||||||
|   Size(B,N,M); assert(L==N); |  | ||||||
|   Matrix C; Resize(C,K,M); |  | ||||||
|  |  | ||||||
|   for(int i=0;i<K;i++){ |  | ||||||
|     for(int j=0;j<M;j++){ |  | ||||||
|       T sum = 0.0; |  | ||||||
|       for(int k=0;k<N;k++) sum += A[i][k]*B[k][j]; |  | ||||||
|       C[i][j] =sum; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return C;  |  | ||||||
| }  |  | ||||||
| /** Matrix Vector multiplication **/ |  | ||||||
| inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){ |  | ||||||
|   int M,N,K; |  | ||||||
|   Size(A,N,M); |  | ||||||
|   Size(B,K); assert(K==M); |  | ||||||
|   Vector<T> C; Resize(C,N); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|     T sum = 0.0; |  | ||||||
|     for(int j=0;j<M;j++) sum += A[i][j]*B[j]; |  | ||||||
|     C[i] =  sum; |  | ||||||
|   } |  | ||||||
|   return C;  |  | ||||||
| }  |  | ||||||
|  |  | ||||||
| /** Some version of Matrix norm **/ |  | ||||||
| /* |  | ||||||
| inline T Norm(){ // this is not a usual L2 norm |  | ||||||
|     T norm = 0; |  | ||||||
|     for(int i=0;i<dim;i++){ |  | ||||||
|       for(int j=0;j<dim;j++){ |  | ||||||
| 	norm += abs(A[i][j]); |  | ||||||
|     }} |  | ||||||
|     return norm; |  | ||||||
|   } |  | ||||||
| */ |  | ||||||
|  |  | ||||||
| /** Some version of Matrix norm **/ |  | ||||||
| template<class T> T LargestDiag(Matrix<T> &A) |  | ||||||
| { |  | ||||||
|   int dim ; SizeSquare(A,dim);  |  | ||||||
|  |  | ||||||
|   T ld = abs(A[0][0]); |  | ||||||
|   for(int i=1;i<dim;i++){ |  | ||||||
|     T cf = abs(A[i][i]); |  | ||||||
|     if(abs(cf) > abs(ld) ){ld = cf;} |  | ||||||
|   } |  | ||||||
|   return ld; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Look for entries on the leading subdiagonal that are smaller than 'small' **/ |  | ||||||
| template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small) |  | ||||||
| { |  | ||||||
|   int dim; SizeSquare(A,dim); |  | ||||||
|   for(int l = dim - 1 - offset; l >= 1; l--) {             		 |  | ||||||
|     if((U)abs(A[l][l - 1]) < (U)small) { |  | ||||||
|       A[l][l-1]=(U)0.0; |  | ||||||
|       return l; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /** Look for entries on the leading subdiagonal that are smaller than 'small' **/ |  | ||||||
| template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)  |  | ||||||
| { |  | ||||||
|   int dim; SizeSquare(A,dim); |  | ||||||
|   for(int l = dim - 1 - offset; l >= 1; l--) { |  | ||||||
|     if((U)abs(A[l][l - 1]) < (U)small) { |  | ||||||
|       A[l][l - 1] = (U)0.0; |  | ||||||
|       A[l - 1][l] = (U)0.0; |  | ||||||
|       return l; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
| /**Assign a submatrix to a larger one**/ |  | ||||||
| template<class T> |  | ||||||
| void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S) |  | ||||||
| { |  | ||||||
|   for(int i = row_st; i<row_end; i++){ |  | ||||||
|     for(int j = col_st; j<col_end; j++){ |  | ||||||
|       A[i][j] = S[i - row_st][j - col_st]; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /**Get a square submatrix**/ |  | ||||||
| template <class T> |  | ||||||
| Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end) |  | ||||||
| { |  | ||||||
|   Matrix<T> H; Resize(row_end - row_st,col_end-col_st); |  | ||||||
|  |  | ||||||
|   for(int i = row_st; i<row_end; i++){ |  | ||||||
|   for(int j = col_st; j<col_end; j++){ |  | ||||||
|     H[i-row_st][j-col_st]=A[i][j]; |  | ||||||
|   }} |  | ||||||
|   return H; |  | ||||||
| } |  | ||||||
|    |  | ||||||
|  /**Assign a submatrix to a larger one NB remember Vector Vectors are transposes of the matricies they represent**/ |  | ||||||
| template<class T> |  | ||||||
| void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S) |  | ||||||
| { |  | ||||||
|   for(int i = row_st; i<row_end; i++){ |  | ||||||
|   for(int j = col_st; j<col_end; j++){ |  | ||||||
|     A[i][j] = S[i - row_st][j - col_st]; |  | ||||||
|   }} |  | ||||||
| } |  | ||||||
|    |  | ||||||
| /** compute b_i A_ij b_j **/ // surprised no Conj |  | ||||||
| template<class T> T proj(Matrix<T> A, Vector<T> B){ |  | ||||||
|   int dim; SizeSquare(A,dim); |  | ||||||
|   int dimB; Size(B,dimB); |  | ||||||
|   assert(dimB==dim); |  | ||||||
|   T C = 0; |  | ||||||
|   for(int i=0;i<dim;i++){ |  | ||||||
|     T sum = 0.0; |  | ||||||
|     for(int j=0;j<dim;j++){ |  | ||||||
|       sum += A[i][j]*B[j]; |  | ||||||
|     } |  | ||||||
|     C +=  B[i]*sum; // No conj? |  | ||||||
|   } |  | ||||||
|   return C;  |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /* |  | ||||||
|  ************************************************************* |  | ||||||
|  * |  | ||||||
|  * Matrix Vector products |  | ||||||
|  * |  | ||||||
|  ************************************************************* |  | ||||||
|  */ |  | ||||||
| // Instead make a linop and call my CG; |  | ||||||
|  |  | ||||||
| /// q -> q Q |  | ||||||
| template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q) |  | ||||||
| { |  | ||||||
|   int M; SizeSquare(Q,M); |  | ||||||
|   int N; Size(q,N);  |  | ||||||
|   assert(M==N); |  | ||||||
|  |  | ||||||
|   times(q,Q,N); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// q -> q Q |  | ||||||
| template <class T> void times(multi1d<LatticeFermion> &q, Matrix<T> &Q, int N) |  | ||||||
| { |  | ||||||
|   GridBase *grid = q[0]._grid; |  | ||||||
|   int M; SizeSquare(Q,M); |  | ||||||
|   int K; Size(q,K);  |  | ||||||
|   assert(N<M); |  | ||||||
|   assert(N<K); |  | ||||||
|   Vector<Fermion> S(N,grid ); |  | ||||||
|   for(int j=0;j<N;j++){ |  | ||||||
|     S[j] = zero; |  | ||||||
|     for(int k=0;k<N;k++){ |  | ||||||
|       S[j] = S[j] +  q[k]* Q[k][j];  |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   for(int j=0;j<q.size();j++){ |  | ||||||
|     q[j] = S[j]; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| @@ -1,75 +0,0 @@ | |||||||
|     /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/algorithms/iterative/MatrixUtils.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #ifndef GRID_MATRIX_UTILS_H |  | ||||||
| #define GRID_MATRIX_UTILS_H |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
|  |  | ||||||
|   namespace MatrixUtils {  |  | ||||||
|  |  | ||||||
|     template<class T> inline void Size(Matrix<T>& A,int &N,int &M){ |  | ||||||
|       N=A.size(); assert(N>0); |  | ||||||
|       M=A[0].size(); |  | ||||||
|       for(int i=0;i<N;i++){ |  | ||||||
| 	assert(A[i].size()==M); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     template<class T> inline void SizeSquare(Matrix<T>& A,int &N) |  | ||||||
|     { |  | ||||||
|       int M; |  | ||||||
|       Size(A,N,M); |  | ||||||
|       assert(N==M); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     template<class T> inline void Fill(Matrix<T>& A,T & val) |  | ||||||
|     {  |  | ||||||
|       int N,M; |  | ||||||
|       Size(A,N,M); |  | ||||||
|       for(int i=0;i<N;i++){ |  | ||||||
|       for(int j=0;j<M;j++){ |  | ||||||
| 	A[i][j]=val; |  | ||||||
|       }} |  | ||||||
|     } |  | ||||||
|     template<class T> inline void Diagonal(Matrix<T>& A,T & val) |  | ||||||
|     {  |  | ||||||
|       int N; |  | ||||||
|       SizeSquare(A,N); |  | ||||||
|       for(int i=0;i<N;i++){ |  | ||||||
| 	A[i][i]=val; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     template<class T> inline void Identity(Matrix<T>& A) |  | ||||||
|     { |  | ||||||
|       Fill(A,0.0); |  | ||||||
|       Diagonal(A,1.0); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|   }; |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| @@ -1,15 +0,0 @@ | |||||||
| - ConjugateGradientMultiShift |  | ||||||
| - MCR |  | ||||||
|  |  | ||||||
| - Potentially Useful Boost libraries |  | ||||||
|  |  | ||||||
| - MultiArray |  | ||||||
| - Aligned allocator; memory pool |  | ||||||
| - Remez -- Mike or Boost? |  | ||||||
| - Multiprecision |  | ||||||
| - quaternians |  | ||||||
| - Tokenize |  | ||||||
| - Serialization |  | ||||||
| - Regex |  | ||||||
| - Proto (ET) |  | ||||||
| - uBlas |  | ||||||
| @@ -1,122 +0,0 @@ | |||||||
| #include <math.h> |  | ||||||
| #include <stdlib.h> |  | ||||||
| #include <vector> |  | ||||||
|  |  | ||||||
| struct Bisection { |  | ||||||
|  |  | ||||||
| static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig) |  | ||||||
| { |  | ||||||
|   int i,j; |  | ||||||
|   std::vector<RealD> evec1(row_num+3); |  | ||||||
|   std::vector<RealD> evec2(row_num+3); |  | ||||||
|   RealD eps2; |  | ||||||
|   ALPHA[1]=0.; |  | ||||||
|   BETHA[1]=0.; |  | ||||||
|   for(i=0;i<row_num-1;i++) { |  | ||||||
|     ALPHA[i+1] = A[i*(row_num+1)].real(); |  | ||||||
|     BETHA[i+2] = A[i*(row_num+1)+1].real(); |  | ||||||
|   } |  | ||||||
|   ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real(); |  | ||||||
|   bisec(ALPHA,BETHA,row_num,1,row_num,1e-10,1e-10,evec1,eps2); |  | ||||||
|   bisec(ALPHA,BETHA,row_num,1,row_num,1e-16,1e-16,evec2,eps2); |  | ||||||
|  |  | ||||||
|   // Do we really need to sort here? |  | ||||||
|   int begin=1; |  | ||||||
|   int end = row_num; |  | ||||||
|   int swapped=1; |  | ||||||
|   while(swapped) { |  | ||||||
|     swapped=0; |  | ||||||
|     for(i=begin;i<end;i++){ |  | ||||||
|       if(mag(evec2[i])>mag(evec2[i+1]))	{ |  | ||||||
| 	swap(evec2+i,evec2+i+1); |  | ||||||
| 	swapped=1; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     end--; |  | ||||||
|     for(i=end-1;i>=begin;i--){ |  | ||||||
|       if(mag(evec2[i])>mag(evec2[i+1]))	{ |  | ||||||
| 	swap(evec2+i,evec2+i+1); |  | ||||||
| 	swapped=1; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     begin++; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(i=0;i<row_num;i++){ |  | ||||||
|     for(j=0;j<row_num;j++) { |  | ||||||
|       if(i==j) H[i*row_num+j]=evec2[i+1]; |  | ||||||
|       else H[i*row_num+j]=0.; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static void bisec(std::vector<RealD> &c,    |  | ||||||
| 		  std::vector<RealD> &b, |  | ||||||
| 		  int n, |  | ||||||
| 		  int m1, |  | ||||||
| 		  int m2, |  | ||||||
| 		  RealD eps1, |  | ||||||
| 		  RealD relfeh, |  | ||||||
| 		  std::vector<RealD> &x, |  | ||||||
| 		  RealD &eps2) |  | ||||||
| { |  | ||||||
|   std::vector<RealD> wu(n+2); |  | ||||||
|  |  | ||||||
|   RealD h,q,x1,xu,x0,xmin,xmax;  |  | ||||||
|   int i,a,k; |  | ||||||
|  |  | ||||||
|   b[1]=0.0; |  | ||||||
|   xmin=c[n]-fabs(b[n]); |  | ||||||
|   xmax=c[n]+fabs(b[n]); |  | ||||||
|   for(i=1;i<n;i++){ |  | ||||||
|     h=fabs(b[i])+fabs(b[i+1]); |  | ||||||
|     if(c[i]+h>xmax) xmax= c[i]+h; |  | ||||||
|     if(c[i]-h<xmin) xmin= c[i]-h; |  | ||||||
|   } |  | ||||||
|   xmax *=2.; |  | ||||||
|  |  | ||||||
|   eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin); |  | ||||||
|   if(eps1<=0.0) eps1=eps2; |  | ||||||
|   eps2=0.5*eps1+7.0*(eps2); |  | ||||||
|   x0=xmax; |  | ||||||
|   for(i=m1;i<=m2;i++){ |  | ||||||
|     x[i]=xmax; |  | ||||||
|     wu[i]=xmin; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(k=m2;k>=m1;k--){ |  | ||||||
|     xu=xmin; |  | ||||||
|     i=k; |  | ||||||
|     do{ |  | ||||||
|       if(xu<wu[i]){ |  | ||||||
| 	xu=wu[i]; |  | ||||||
| 	i=m1-1; |  | ||||||
|       } |  | ||||||
|       i--; |  | ||||||
|     }while(i>=m1); |  | ||||||
|     if(x0>x[k]) x0=x[k]; |  | ||||||
|     while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ |  | ||||||
|       x1=(xu+x0)/2; |  | ||||||
|  |  | ||||||
|       a=0; |  | ||||||
|       q=1.0; |  | ||||||
|       for(i=1;i<=n;i++){ |  | ||||||
| 	q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); |  | ||||||
| 	if(q<0) a++; |  | ||||||
|       } |  | ||||||
|       //			printf("x1=%e a=%d\n",x1,a); |  | ||||||
|       if(a<k){ |  | ||||||
| 	if(a<m1){ |  | ||||||
| 	  xu=x1; |  | ||||||
| 	  wu[m1]=x1; |  | ||||||
| 	}else { |  | ||||||
| 	  xu=x1; |  | ||||||
| 	  wu[a+1]=x1; |  | ||||||
| 	  if(x[a]>x1) x[a]=x1; |  | ||||||
| 	} |  | ||||||
|       }else x0=x1; |  | ||||||
|     } |  | ||||||
|     x[k]=(x0+xu)/2; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| } |  | ||||||
| @@ -1 +0,0 @@ | |||||||
|  |  | ||||||
| @@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef GRID_LATTICE_REDUCTION_H | #ifndef GRID_LATTICE_REDUCTION_H | ||||||
| #define GRID_LATTICE_REDUCTION_H | #define GRID_LATTICE_REDUCTION_H | ||||||
|  |  | ||||||
|  | #include <Grid/Eigen/Dense> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| #ifdef GRID_WARN_SUBOPTIMAL | #ifdef GRID_WARN_SUBOPTIMAL | ||||||
| #warning "Optimisation alert all these reduction loops are NOT threaded " | #warning "Optimisation alert all these reduction loops are NOT threaded " | ||||||
| @@ -43,27 +45,25 @@ namespace Grid { | |||||||
|   return std::real(nrm);  |   return std::real(nrm);  | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // Double inner product | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)  | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type; |   typedef typename vobj::scalar_type scalar_type; | ||||||
|       typedef typename vobj::vector_type vector_type; |   typedef typename vobj::vector_typeD vector_type; | ||||||
|   scalar_type  nrm; |   scalar_type  nrm; | ||||||
|    |    | ||||||
|   GridBase *grid = left._grid; |   GridBase *grid = left._grid; | ||||||
|    |    | ||||||
|   std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); |   std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); | ||||||
|       for(int i=0;i<grid->SumArraySize();i++){ |  | ||||||
| 	sumarray[i]=zero; |  | ||||||
|       } |  | ||||||
|    |    | ||||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ |   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||||
|     int nwork, mywork, myoff; |     int nwork, mywork, myoff; | ||||||
|     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); |     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||||
|      |      | ||||||
| 	decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation |     decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation | ||||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ |     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||||
| 	  vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]); |       vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]); | ||||||
|     } |     } | ||||||
|     sumarray[thr]=TensorRemove(vnrm) ; |     sumarray[thr]=TensorRemove(vnrm) ; | ||||||
|   } |   } | ||||||
| @@ -103,8 +103,8 @@ namespace Grid { | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
|     inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){ | inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||||
|  | { | ||||||
|   GridBase *grid=arg._grid; |   GridBase *grid=arg._grid; | ||||||
|   int Nsimd = grid->Nsimd(); |   int Nsimd = grid->Nsimd(); | ||||||
|    |    | ||||||
| @@ -142,16 +142,21 @@ namespace Grid { | |||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | ||||||
|  | ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) | template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) | ||||||
| { | { | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // FIXME precision promoted summation | ||||||
|  |   // may be important for correlation functions | ||||||
|  |   // But easily avoided by using double precision fields | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|   GridBase  *grid = Data._grid; |   GridBase  *grid = Data._grid; | ||||||
|   assert(grid!=NULL); |   assert(grid!=NULL); | ||||||
|  |  | ||||||
|   // FIXME |  | ||||||
|   // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl; |  | ||||||
|  |  | ||||||
|   const int    Nd = grid->_ndimension; |   const int    Nd = grid->_ndimension; | ||||||
|   const int Nsimd = grid->Nsimd(); |   const int Nsimd = grid->Nsimd(); | ||||||
|  |  | ||||||
| @@ -166,20 +171,28 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | |||||||
|   std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars |   std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars | ||||||
|   std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD |   std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||||
|  |  | ||||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file |   result.resize(fd); // And then global sum to return the same vector to every node  | ||||||
|   for(int r=0;r<rd;r++){ |   for(int r=0;r<rd;r++){ | ||||||
|     lvSum[r]=zero; |     lvSum[r]=zero; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   std::vector<int>  coor(Nd);   |   int e1=    grid->_slice_nblock[orthogdim]; | ||||||
|  |   int e2=    grid->_slice_block [orthogdim]; | ||||||
|  |   int stride=grid->_slice_stride[orthogdim]; | ||||||
|  |  | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
|  |   // Parallel over orthog direction | ||||||
|  |   parallel_for(int r=0;r<rd;r++){ | ||||||
|  |  | ||||||
|   for(int ss=0;ss<grid->oSites();ss++){ |     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||||
|     Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); |  | ||||||
|     int r = coor[orthogdim]; |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int ss= so+n*stride+b; | ||||||
| 	lvSum[r]=lvSum[r]+Data._odata[ss]; | 	lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||||
|       } |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. |   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||||
|   std::vector<int> icoor(Nd); |   std::vector<int> icoor(Nd); | ||||||
| @@ -214,10 +227,304 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | |||||||
|  |  | ||||||
|     result[t]=gsum; |     result[t]=gsum; | ||||||
|   } |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | ||||||
|  | { | ||||||
|  |   typedef typename vobj::vector_type   vector_type; | ||||||
|  |   typedef typename vobj::scalar_type   scalar_type; | ||||||
|  |   GridBase  *grid = lhs._grid; | ||||||
|  |   assert(grid!=NULL); | ||||||
|  |   conformable(grid,rhs._grid); | ||||||
|  |  | ||||||
|  |   const int    Nd = grid->_ndimension; | ||||||
|  |   const int Nsimd = grid->Nsimd(); | ||||||
|  |  | ||||||
|  |   assert(orthogdim >= 0); | ||||||
|  |   assert(orthogdim < Nd); | ||||||
|  |  | ||||||
|  |   int fd=grid->_fdimensions[orthogdim]; | ||||||
|  |   int ld=grid->_ldimensions[orthogdim]; | ||||||
|  |   int rd=grid->_rdimensions[orthogdim]; | ||||||
|  |  | ||||||
|  |   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first | ||||||
|  |   std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||||
|  |   std::vector<iScalar<scalar_type> > extracted(Nsimd);                  // splitting the SIMD | ||||||
|  |  | ||||||
|  |   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||||
|  |   for(int r=0;r<rd;r++){ | ||||||
|  |     lvSum[r]=zero; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   int e1=    grid->_slice_nblock[orthogdim]; | ||||||
|  |   int e2=    grid->_slice_block [orthogdim]; | ||||||
|  |   int stride=grid->_slice_stride[orthogdim]; | ||||||
|  |  | ||||||
|  |   parallel_for(int r=0;r<rd;r++){ | ||||||
|  |  | ||||||
|  |     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||||
|  |  | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int ss= so+n*stride+b; | ||||||
|  | 	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); | ||||||
|  | 	lvSum[r]=lvSum[r]+vv; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||||
|  |   std::vector<int> icoor(Nd); | ||||||
|  |   for(int rt=0;rt<rd;rt++){ | ||||||
|  |  | ||||||
|  |     iScalar<vector_type> temp;  | ||||||
|  |     temp._internal = lvSum[rt]; | ||||||
|  |     extract(temp,extracted); | ||||||
|  |  | ||||||
|  |     for(int idx=0;idx<Nsimd;idx++){ | ||||||
|  |  | ||||||
|  |       grid->iCoorFromIindex(icoor,idx); | ||||||
|  |  | ||||||
|  |       int ldx =rt+icoor[orthogdim]*rd; | ||||||
|  |  | ||||||
|  |       lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   // sum over nodes. | ||||||
|  |   scalar_type gsum; | ||||||
|  |   for(int t=0;t<fd;t++){ | ||||||
|  |     int pt = t/ld; // processor plane | ||||||
|  |     int lt = t%ld; | ||||||
|  |     if ( pt == grid->_processor_coor[orthogdim] ) { | ||||||
|  |       gsum=lsSum[lt]; | ||||||
|  |     } else { | ||||||
|  |       gsum=scalar_type(0.0); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     grid->GlobalSum(gsum); | ||||||
|  |  | ||||||
|  |     result[t]=gsum; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)  | ||||||
|  | { | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |    | ||||||
|  |   int Nblock = rhs._grid->GlobalDimensions()[Orthog]; | ||||||
|  |   std::vector<ComplexD> ip(Nblock); | ||||||
|  |   sn.resize(Nblock); | ||||||
|  |    | ||||||
|  |   sliceInnerProductVector(ip,rhs,rhs,Orthog); | ||||||
|  |   for(int ss=0;ss<Nblock;ss++){ | ||||||
|  |     sn[ss] = real(ip[ss]); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||||
|  | 			    int orthogdim,RealD scale=1.0)  | ||||||
|  | {     | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |   typedef typename vobj::tensor_reduced tensor_reduced; | ||||||
|  |    | ||||||
|  |   GridBase *grid  = X._grid; | ||||||
|  |  | ||||||
|  |   int Nsimd  =grid->Nsimd(); | ||||||
|  |   int Nblock =grid->GlobalDimensions()[orthogdim]; | ||||||
|  |  | ||||||
|  |   int fd     =grid->_fdimensions[orthogdim]; | ||||||
|  |   int ld     =grid->_ldimensions[orthogdim]; | ||||||
|  |   int rd     =grid->_rdimensions[orthogdim]; | ||||||
|  |  | ||||||
|  |   int e1     =grid->_slice_nblock[orthogdim]; | ||||||
|  |   int e2     =grid->_slice_block [orthogdim]; | ||||||
|  |   int stride =grid->_slice_stride[orthogdim]; | ||||||
|  |  | ||||||
|  |   std::vector<int> icoor; | ||||||
|  |  | ||||||
|  |   for(int r=0;r<rd;r++){ | ||||||
|  |  | ||||||
|  |     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||||
|  |  | ||||||
|  |     vector_type    av; | ||||||
|  |  | ||||||
|  |     for(int l=0;l<Nsimd;l++){ | ||||||
|  |       grid->iCoorFromIindex(icoor,l); | ||||||
|  |       int ldx =r+icoor[orthogdim]*rd; | ||||||
|  |       scalar_type *as =(scalar_type *)&av; | ||||||
|  |       as[l] = scalar_type(a[ldx])*scale; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     tensor_reduced at; at=av; | ||||||
|  |  | ||||||
|  |     parallel_for_nest2(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int ss= so+n*stride+b; | ||||||
|  | 	R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||||
|  | 			     int Orthog,RealD scale=1.0)  | ||||||
|  | {     | ||||||
|  |   // FIXME: Implementation is slow | ||||||
|  |   // Best base the linear combination by constructing a  | ||||||
|  |   // set of vectors of size grid->_rdimensions[Orthog]. | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |    | ||||||
|  |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |    | ||||||
|  |   GridBase *FullGrid  = X._grid; | ||||||
|  |   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |    | ||||||
|  |   Lattice<vobj> Xslice(SliceGrid); | ||||||
|  |   Lattice<vobj> Rslice(SliceGrid); | ||||||
|  |   // If we based this on Cshift it would work for spread out | ||||||
|  |   // but it would be even slower | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |     ExtractSlice(Rslice,Y,i,Orthog); | ||||||
|  |     ExtractSlice(Xslice,X,i,Orthog); | ||||||
|  |     Rslice = Rslice + Xslice*(scale*a[i]); | ||||||
|  |     InsertSlice(Rslice,R,i,Orthog); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||||
|  |   { | ||||||
|  |     // FIXME: Implementation is slow | ||||||
|  |     // Look at localInnerProduct implementation, | ||||||
|  |     // and do inside a site loop with block strided iterators | ||||||
|  |     typedef typename vobj::scalar_object sobj; | ||||||
|  |     typedef typename vobj::scalar_type scalar_type; | ||||||
|  |     typedef typename vobj::vector_type vector_type; | ||||||
|  |     typedef typename vobj::tensor_reduced scalar; | ||||||
|  |     typedef typename scalar::scalar_object  scomplex; | ||||||
|  |    | ||||||
|  |     int Nblock = lhs._grid->GlobalDimensions()[Orthog]; | ||||||
|  |  | ||||||
|  |     vec.resize(Nblock); | ||||||
|  |     std::vector<scomplex> sip(Nblock); | ||||||
|  |     Lattice<scalar> IP(lhs._grid);  | ||||||
|  |  | ||||||
|  |     IP=localInnerProduct(lhs,rhs); | ||||||
|  |     sliceSum(IP,sip,Orthog); | ||||||
|  |    | ||||||
|  |     for(int ss=0;ss<Nblock;ss++){ | ||||||
|  |       vec[ss] = TensorRemove(sip[ss]); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | ////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // FIXME: Implementation is slow | ||||||
|  | // If we based this on Cshift it would work for spread out | ||||||
|  | // but it would be even slower | ||||||
|  | // | ||||||
|  | // Repeated extract slice is inefficient | ||||||
|  | // | ||||||
|  | // Best base the linear combination by constructing a  | ||||||
|  | // set of vectors of size grid->_rdimensions[Orthog]. | ||||||
|  | ////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  | inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||||
|  | { | ||||||
|  |   int NN    = BlockSolverGrid->_ndimension; | ||||||
|  |   int nsimd = BlockSolverGrid->Nsimd(); | ||||||
|  |    | ||||||
|  |   std::vector<int> latt_phys(0); | ||||||
|  |   std::vector<int> simd_phys(0); | ||||||
|  |   std::vector<int>  mpi_phys(0); | ||||||
|  |    | ||||||
|  |   for(int d=0;d<NN;d++){ | ||||||
|  |     if( d!=Orthog ) {  | ||||||
|  |       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); | ||||||
|  |       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); | ||||||
|  |       mpi_phys.push_back(BlockSolverGrid->_processors[d]); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||||
|  | {     | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |  | ||||||
|  |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |    | ||||||
|  |   GridBase *FullGrid  = X._grid; | ||||||
|  |   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |    | ||||||
|  |   Lattice<vobj> Xslice(SliceGrid); | ||||||
|  |   Lattice<vobj> Rslice(SliceGrid); | ||||||
|  |    | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |     ExtractSlice(Rslice,Y,i,Orthog); | ||||||
|  |     for(int j=0;j<Nblock;j++){ | ||||||
|  |       ExtractSlice(Xslice,X,j,Orthog); | ||||||
|  |       Rslice = Rslice + Xslice*(scale*aa(j,i)); | ||||||
|  |     } | ||||||
|  |     InsertSlice(Rslice,R,i,Orthog); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template<class vobj> | ||||||
|  | static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||||
|  | { | ||||||
|  |   // FIXME: Implementation is slow | ||||||
|  |   // Not sure of best solution.. think about it | ||||||
|  |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_type vector_type; | ||||||
|  |    | ||||||
|  |   GridBase *FullGrid  = lhs._grid; | ||||||
|  |   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |    | ||||||
|  |   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||||
|  |    | ||||||
|  |   Lattice<vobj> Lslice(SliceGrid); | ||||||
|  |   Lattice<vobj> Rslice(SliceGrid); | ||||||
|  |    | ||||||
|  |   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |    | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |     ExtractSlice(Lslice,lhs,i,Orthog); | ||||||
|  |     for(int j=0;j<Nblock;j++){ | ||||||
|  |       ExtractSlice(Rslice,rhs,j,Orthog); | ||||||
|  |       mat(i,j) = innerProduct(Lslice,Rslice); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | #undef FORCE_DIAG | ||||||
|  | #ifdef FORCE_DIAG | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |     for(int j=0;j<Nblock;j++){ | ||||||
|  |       if ( i != j ) mat(i,j)=0.0; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  | #endif | ||||||
|  |   return; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | } /*END NAMESPACE GRID*/ | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -359,7 +359,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | |||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
| @@ -401,7 +401,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog) | void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
| @@ -444,7 +444,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in | |||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -110,8 +110,8 @@ public: | |||||||
|   friend std::ostream& operator<< (std::ostream& stream, Logger& log){ |   friend std::ostream& operator<< (std::ostream& stream, Logger& log){ | ||||||
|  |  | ||||||
|     if ( log.active ) { |     if ( log.active ) { | ||||||
|       stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : "; |       stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : "; | ||||||
|       stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : "; |       stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : "; | ||||||
|       if ( log.timestamp ) { |       if ( log.timestamp ) { | ||||||
| 	StopWatch.Stop(); | 	StopWatch.Stop(); | ||||||
| 	GridTime now = StopWatch.Elapsed(); | 	GridTime now = StopWatch.Elapsed(); | ||||||
|   | |||||||
| @@ -58,6 +58,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | |||||||
| #include <Grid/qcd/action/fermion/DomainWallFermion.h> | #include <Grid/qcd/action/fermion/DomainWallFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/MobiusFermion.h> | #include <Grid/qcd/action/fermion/MobiusFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/ZMobiusFermion.h> | #include <Grid/qcd/action/fermion/ZMobiusFermion.h> | ||||||
|  | #include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h> | ||||||
| #include <Grid/qcd/action/fermion/ScaledShamirFermion.h> | #include <Grid/qcd/action/fermion/ScaledShamirFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h> | #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h> | #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h> | ||||||
|   | |||||||
| @@ -160,8 +160,6 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const | |||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); |     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   std::cout << " Umu " << Umu._odata[0]<<std::endl; |  | ||||||
|   std::cout << " UUUmu " << UUUmu._odata[0]<<std::endl; |  | ||||||
|   pickCheckerboard(Even, UmuEven, Umu); |   pickCheckerboard(Even, UmuEven, Umu); | ||||||
|   pickCheckerboard(Odd,  UmuOdd , Umu); |   pickCheckerboard(Odd,  UmuOdd , Umu); | ||||||
|   pickCheckerboard(Even, UUUmuEven, UUUmu); |   pickCheckerboard(Even, UUUmuEven, UUUmu); | ||||||
|   | |||||||
							
								
								
									
										102
									
								
								lib/qcd/action/fermion/SchurDiagTwoKappa.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										102
									
								
								lib/qcd/action/fermion/SchurDiagTwoKappa.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,102 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: SchurDiagTwoKappa.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2017 | ||||||
|  |  | ||||||
|  | Author: Christoph Lehner | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #ifndef  _SCHUR_DIAG_TWO_KAPPA_H | ||||||
|  | #define  _SCHUR_DIAG_TWO_KAPPA_H | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |   // This is specific to (Z)mobius fermions | ||||||
|  |   template<class Matrix, class Field> | ||||||
|  |     class KappaSimilarityTransform { | ||||||
|  |   public: | ||||||
|  |     INHERIT_IMPL_TYPES(Matrix); | ||||||
|  |     std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag; | ||||||
|  |  | ||||||
|  |     KappaSimilarityTransform (Matrix &zmob) { | ||||||
|  |       for (int i=0;i<(int)zmob.bs.size();i++) { | ||||||
|  | 	Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) ); | ||||||
|  | 	kappa.push_back( k ); | ||||||
|  | 	kappaDag.push_back( conj(k) ); | ||||||
|  | 	kappaInv.push_back( 1.0 / k ); | ||||||
|  | 	kappaInvDag.push_back( 1.0 / conj(k) ); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   template<typename vobj> | ||||||
|  |     void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) { | ||||||
|  |     GridBase *grid=out._grid; | ||||||
|  |     out.checkerboard = in.checkerboard; | ||||||
|  |     assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now | ||||||
|  |     int Ls = grid->_rdimensions[0]; | ||||||
|  |     parallel_for(int ss=0;ss<grid->oSites();ss++){ | ||||||
|  |       vobj tmp = s[ss % Ls]*in._odata[ss]; | ||||||
|  |       vstream(out._odata[ss],tmp); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) { | ||||||
|  |     sscale(in,out,s); | ||||||
|  |     return norm2(out); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   virtual RealD M       (const Field& in, Field& out) { return sscale_norm(in,out,&kappa[0]);   } | ||||||
|  |   virtual RealD MDag    (const Field& in, Field& out) { return sscale_norm(in,out,&kappaDag[0]);} | ||||||
|  |   virtual RealD MInv    (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);} | ||||||
|  |   virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);} | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   template<class Matrix,class Field> | ||||||
|  |     class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> { | ||||||
|  |   public: | ||||||
|  |     KappaSimilarityTransform<Matrix, Field> _S; | ||||||
|  |     SchurDiagTwoOperator<Matrix, Field> _Mat; | ||||||
|  |  | ||||||
|  |     SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {}; | ||||||
|  |  | ||||||
|  |     virtual  RealD Mpc      (const Field &in, Field &out) { | ||||||
|  |       Field tmp(in._grid); | ||||||
|  |  | ||||||
|  |       _S.MInv(in,out); | ||||||
|  |       _Mat.Mpc(out,tmp); | ||||||
|  |       return _S.M(tmp,out); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     virtual  RealD MpcDag   (const Field &in, Field &out){ | ||||||
|  |       Field tmp(in._grid); | ||||||
|  |  | ||||||
|  |       _S.MDag(in,out); | ||||||
|  |       _Mat.MpcDag(out,tmp); | ||||||
|  |       return _S.MInvDag(tmp,out); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
| @@ -54,7 +54,7 @@ THE SOFTWARE. | |||||||
|  |  | ||||||
| #define GRID_MACRO_EMPTY() | #define GRID_MACRO_EMPTY() | ||||||
|  |  | ||||||
| #define GRID_MACRO_EVAL(...)     GRID_MACRO_EVAL1024(__VA_ARGS__) | #define GRID_MACRO_EVAL(...)     GRID_MACRO_EVAL64(__VA_ARGS__) | ||||||
| #define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__)) | #define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__)) | ||||||
| #define GRID_MACRO_EVAL512(...)  GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__)) | #define GRID_MACRO_EVAL512(...)  GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__)) | ||||||
| #define GRID_MACRO_EVAL256(...)  GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__)) | #define GRID_MACRO_EVAL256(...)  GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__)) | ||||||
|   | |||||||
| @@ -377,8 +377,8 @@ namespace Optimization { | |||||||
|       b0 = _mm256_extractf128_si256(b,0); |       b0 = _mm256_extractf128_si256(b,0); | ||||||
|       a1 = _mm256_extractf128_si256(a,1); |       a1 = _mm256_extractf128_si256(a,1); | ||||||
|       b1 = _mm256_extractf128_si256(b,1); |       b1 = _mm256_extractf128_si256(b,1); | ||||||
|       a0 = _mm_mul_epi32(a0,b0); |       a0 = _mm_mullo_epi32(a0,b0); | ||||||
|       a1 = _mm_mul_epi32(a1,b1); |       a1 = _mm_mullo_epi32(a1,b1); | ||||||
|       return _mm256_set_m128i(a1,a0); |       return _mm256_set_m128i(a1,a0); | ||||||
| #endif | #endif | ||||||
| #if defined (AVX2) | #if defined (AVX2) | ||||||
| @@ -470,7 +470,52 @@ namespace Optimization { | |||||||
|       return in; |       return in; | ||||||
|     }; |     }; | ||||||
|   }; |   }; | ||||||
|  | #define USE_FP16 | ||||||
|  |   struct PrecisionChange { | ||||||
|  |     static inline __m256i StoH (__m256 a,__m256 b) { | ||||||
|  |       __m256 h; | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       __m128i ha = _mm256_cvtps_ph(a,0); | ||||||
|  |       __m128i hb = _mm256_cvtps_ph(b,0); | ||||||
|  |       h = _mm256_castps128_ps256(ha); | ||||||
|  |       h = _mm256_insertf128_ps(h,hb,1); | ||||||
|  | #else  | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |       return h; | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); | ||||||
|  |       sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); | ||||||
|  | #else  | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |     static inline __m256 DtoS (__m256d a,__m256d b) { | ||||||
|  |       __m128 sa = _mm256_cvtpd_ps(a); | ||||||
|  |       __m128 sb = _mm256_cvtpd_ps(b); | ||||||
|  |       __m256 s = _mm256_castps128_ps256(sa); | ||||||
|  |       s = _mm256_insertf128_ps(s,sb,1); | ||||||
|  |       return s; | ||||||
|  |     } | ||||||
|  |     static inline void StoD (__m256 s,__m256d &a,__m256d &b) { | ||||||
|  |       a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); | ||||||
|  |       b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); | ||||||
|  |     } | ||||||
|  |     static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { | ||||||
|  |       __m256 sa,sb; | ||||||
|  |       sa = DtoS(a,b); | ||||||
|  |       sb = DtoS(c,d); | ||||||
|  |       return StoH(sa,sb); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { | ||||||
|  |       __m256 sa,sb; | ||||||
|  |       HtoS(h,sa,sb); | ||||||
|  |       StoD(sa,a,b); | ||||||
|  |       StoD(sb,c,d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|   struct Exchange{ |   struct Exchange{ | ||||||
|     // 3210 ordering |     // 3210 ordering | ||||||
|     static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ |     static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ | ||||||
| @@ -675,6 +720,7 @@ namespace Optimization { | |||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types  | // Here assign types  | ||||||
|  |  | ||||||
|  |   typedef __m256i SIMD_Htype;  // Single precision type | ||||||
|   typedef __m256  SIMD_Ftype; // Single precision type |   typedef __m256  SIMD_Ftype; // Single precision type | ||||||
|   typedef __m256d SIMD_Dtype; // Double precision type |   typedef __m256d SIMD_Dtype; // Double precision type | ||||||
|   typedef __m256i SIMD_Itype; // Integer type |   typedef __m256i SIMD_Itype; // Integer type | ||||||
|   | |||||||
| @@ -235,11 +235,9 @@ namespace Optimization { | |||||||
|     inline void mac(__m512 &a, __m512 b, __m512 c){          |     inline void mac(__m512 &a, __m512 b, __m512 c){          | ||||||
|        a= _mm512_fmadd_ps( b, c, a);                          |        a= _mm512_fmadd_ps( b, c, a);                          | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     inline void mac(__m512d &a, __m512d b, __m512d c){ |     inline void mac(__m512d &a, __m512d b, __m512d c){ | ||||||
|       a= _mm512_fmadd_pd( b, c, a);                    |       a= _mm512_fmadd_pd( b, c, a);                    | ||||||
|     }                                              |     }                                              | ||||||
|  |  | ||||||
|     // Real float |     // Real float | ||||||
|     inline __m512 operator()(__m512 a, __m512 b){ |     inline __m512 operator()(__m512 a, __m512 b){ | ||||||
|       return _mm512_mul_ps(a,b); |       return _mm512_mul_ps(a,b); | ||||||
| @@ -342,7 +340,52 @@ namespace Optimization { | |||||||
|     }; |     }; | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  | #define USE_FP16 | ||||||
|  |   struct PrecisionChange { | ||||||
|  |     static inline __m512i StoH (__m512 a,__m512 b) { | ||||||
|  |       __m512i h; | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       __m256i ha = _mm512_cvtps_ph(a,0); | ||||||
|  |       __m256i hb = _mm512_cvtps_ph(b,0); | ||||||
|  |       h =(__m512i) _mm512_castps256_ps512((__m256)ha); | ||||||
|  |       h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1); | ||||||
|  | #else | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |       return h; | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (__m512i h,__m512 &sa,__m512 &sb) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0)); | ||||||
|  |       sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1)); | ||||||
|  | #else | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |     static inline __m512 DtoS (__m512d a,__m512d b) { | ||||||
|  |       __m256 sa = _mm512_cvtpd_ps(a); | ||||||
|  |       __m256 sb = _mm512_cvtpd_ps(b); | ||||||
|  |       __m512 s = _mm512_castps256_ps512(sa); | ||||||
|  |       s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); | ||||||
|  |       return s; | ||||||
|  |     } | ||||||
|  |     static inline void StoD (__m512 s,__m512d &a,__m512d &b) { | ||||||
|  |       a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0)); | ||||||
|  |       b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1)); | ||||||
|  |     } | ||||||
|  |     static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { | ||||||
|  |       __m512 sa,sb; | ||||||
|  |       sa = DtoS(a,b); | ||||||
|  |       sb = DtoS(c,d); | ||||||
|  |       return StoH(sa,sb); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { | ||||||
|  |       __m512 sa,sb; | ||||||
|  |       HtoS(h,sa,sb); | ||||||
|  |       StoD(sa,a,b); | ||||||
|  |       StoD(sb,c,d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|   // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl |   // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl | ||||||
|   // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl |   // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl | ||||||
|   // The operation is its own inverse |   // The operation is its own inverse | ||||||
| @@ -539,6 +582,8 @@ namespace Optimization { | |||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types  | // Here assign types  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   typedef __m512i SIMD_Htype;  // Single precision type | ||||||
|   typedef __m512  SIMD_Ftype;  // Single precision type |   typedef __m512  SIMD_Ftype;  // Single precision type | ||||||
|   typedef __m512d SIMD_Dtype; // Double precision type |   typedef __m512d SIMD_Dtype; // Double precision type | ||||||
|   typedef __m512i SIMD_Itype; // Integer type |   typedef __m512i SIMD_Itype; // Integer type | ||||||
|   | |||||||
| @@ -279,6 +279,101 @@ namespace Optimization { | |||||||
|    |    | ||||||
|   #undef timesi |   #undef timesi | ||||||
|  |  | ||||||
|  |   struct PrecisionChange { | ||||||
|  |     static inline vech StoH (const vecf &a,const vecf &b) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       vech ret; | ||||||
|  |       vech *ha = (vech *)&a; | ||||||
|  |       vech *hb = (vech *)&b; | ||||||
|  |       const int nf = W<float>::r; | ||||||
|  |       //      VECTOR_FOR(i, nf,1){ ret.v[i]    = ( (uint16_t *) &a.v[i])[1] ; } | ||||||
|  |       //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } | ||||||
|  |       VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; } | ||||||
|  |       VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } | ||||||
|  | #else | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (vech h,vecf &sa,vecf &sb) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       const int nf = W<float>::r; | ||||||
|  |       const int nh = W<uint16_t>::r; | ||||||
|  |       vech *ha = (vech *)&sa; | ||||||
|  |       vech *hb = (vech *)&sb; | ||||||
|  |       VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } | ||||||
|  |       //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} | ||||||
|  |       //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} | ||||||
|  |       VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } | ||||||
|  |       VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } | ||||||
|  | #else | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |     static inline vecf DtoS (vecd a,vecd b) { | ||||||
|  |       const int nd = W<double>::r; | ||||||
|  |       const int nf = W<float>::r; | ||||||
|  |       vecf ret; | ||||||
|  |       VECTOR_FOR(i, nd,1){ ret.v[i]    = a.v[i] ; } | ||||||
|  |       VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |     static inline void StoD (vecf s,vecd &a,vecd &b) { | ||||||
|  |       const int nd = W<double>::r; | ||||||
|  |       VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } | ||||||
|  |       VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } | ||||||
|  |     } | ||||||
|  |     static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { | ||||||
|  |       vecf sa,sb; | ||||||
|  |       sa = DtoS(a,b); | ||||||
|  |       sb = DtoS(c,d); | ||||||
|  |       return StoH(sa,sb); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { | ||||||
|  |       vecf sa,sb; | ||||||
|  |       HtoS(h,sa,sb); | ||||||
|  |       StoD(sa,a,b); | ||||||
|  |       StoD(sb,c,d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////// | ||||||
|  |   // Exchange support | ||||||
|  |   struct Exchange{ | ||||||
|  |  | ||||||
|  |     template <typename T,int n> | ||||||
|  |     static inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){ | ||||||
|  |       const int w = W<T>::r; | ||||||
|  |       unsigned int mask = w >> (n + 1); | ||||||
|  |       //      std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl; | ||||||
|  |       VECTOR_FOR(i, w, 1) {	 | ||||||
|  | 	int j1 = i&(~mask); | ||||||
|  | 	if  ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];} | ||||||
|  | 	else                  { out1.v[i]=in2.v[j1];} | ||||||
|  | 	int j2 = i|mask; | ||||||
|  | 	if  ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];} | ||||||
|  | 	else                  { out2.v[i]=in2.v[j2];} | ||||||
|  |       }       | ||||||
|  |     } | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){ | ||||||
|  |       ExchangeN<T,0>(out1,out2,in1,in2); | ||||||
|  |     }; | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){ | ||||||
|  |       ExchangeN<T,1>(out1,out2,in1,in2); | ||||||
|  |     }; | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){ | ||||||
|  |       ExchangeN<T,2>(out1,out2,in1,in2); | ||||||
|  |     }; | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){ | ||||||
|  |       ExchangeN<T,3>(out1,out2,in1,in2); | ||||||
|  |     }; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
|   // Some Template specialization |   // Some Template specialization | ||||||
|   #define perm(a, b, n, w)\ |   #define perm(a, b, n, w)\ | ||||||
| @@ -403,6 +498,7 @@ namespace Optimization { | |||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types  | // Here assign types  | ||||||
|  |  | ||||||
|  |   typedef Optimization::vech SIMD_Htype; // Reduced precision type | ||||||
|   typedef Optimization::vecf SIMD_Ftype; // Single precision type |   typedef Optimization::vecf SIMD_Ftype; // Single precision type | ||||||
|   typedef Optimization::vecd SIMD_Dtype; // Double precision type |   typedef Optimization::vecd SIMD_Dtype; // Double precision type | ||||||
|   typedef Optimization::veci SIMD_Itype; // Integer type |   typedef Optimization::veci SIMD_Itype; // Integer type | ||||||
|   | |||||||
| @@ -66,6 +66,10 @@ namespace Optimization { | |||||||
|   template <> struct W<Integer> { |   template <> struct W<Integer> { | ||||||
|     constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; |     constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; | ||||||
|   }; |   }; | ||||||
|  |   template <> struct W<uint16_t> { | ||||||
|  |     constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; | ||||||
|  |     constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; | ||||||
|  |   }; | ||||||
|    |    | ||||||
|   // SIMD vector types |   // SIMD vector types | ||||||
|   template <typename T> |   template <typename T> | ||||||
| @@ -75,6 +79,7 @@ namespace Optimization { | |||||||
|  |  | ||||||
|   typedef vec<float>     vecf; |   typedef vec<float>     vecf; | ||||||
|   typedef vec<double>    vecd; |   typedef vec<double>    vecd; | ||||||
|  |   typedef vec<uint16_t>  vech; // half precision comms | ||||||
|   typedef vec<Integer>   veci; |   typedef vec<Integer>   veci; | ||||||
|    |    | ||||||
| }} | }} | ||||||
|   | |||||||
| @@ -125,7 +125,6 @@ namespace Optimization { | |||||||
|       f[2] = a.v2; |       f[2] = a.v2; | ||||||
|       f[3] = a.v3; |       f[3] = a.v3; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     //Double |     //Double | ||||||
|     inline void operator()(double *d, vector4double a){ |     inline void operator()(double *d, vector4double a){ | ||||||
|       vec_st(a, 0, d); |       vec_st(a, 0, d); | ||||||
|   | |||||||
| @@ -38,6 +38,7 @@ Author: neo <cossu@post.kek.jp> | |||||||
|  |  | ||||||
| #include <pmmintrin.h> | #include <pmmintrin.h> | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace Optimization { | namespace Optimization { | ||||||
|  |  | ||||||
| @@ -328,6 +329,56 @@ namespace Optimization { | |||||||
|     }; |     }; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |    | ||||||
|  | #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) | ||||||
|  | #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) | ||||||
|  |  | ||||||
|  |   struct PrecisionChange { | ||||||
|  |     static inline __m128i StoH (__m128 a,__m128 b) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       __m128i ha = _mm_cvtps_ph(a,0); | ||||||
|  |       __m128i hb = _mm_cvtps_ph(b,0); | ||||||
|  |       __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); | ||||||
|  | #else | ||||||
|  |       __m128i h = (__m128i)a; | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |       return h; | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (__m128i h,__m128 &sa,__m128 &sb) { | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |       sa = _mm_cvtph_ps(h);  | ||||||
|  |       h =  (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); | ||||||
|  |       sb = _mm_cvtph_ps(h); | ||||||
|  | #else | ||||||
|  |       assert(0); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |     static inline __m128 DtoS (__m128d a,__m128d b) { | ||||||
|  |       __m128 sa = _mm_cvtpd_ps(a); | ||||||
|  |       __m128 sb = _mm_cvtpd_ps(b); | ||||||
|  |       __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); | ||||||
|  |       return s; | ||||||
|  |     } | ||||||
|  |     static inline void StoD (__m128 s,__m128d &a,__m128d &b) { | ||||||
|  |       a = _mm_cvtps_pd(s); | ||||||
|  |       s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); | ||||||
|  |       b = _mm_cvtps_pd(s); | ||||||
|  |     } | ||||||
|  |     static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { | ||||||
|  |       __m128 sa,sb; | ||||||
|  |       sa = DtoS(a,b); | ||||||
|  |       sb = DtoS(c,d); | ||||||
|  |       return StoH(sa,sb); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { | ||||||
|  |       __m128 sa,sb; | ||||||
|  |       HtoS(h,sa,sb); | ||||||
|  |       StoD(sa,a,b); | ||||||
|  |       StoD(sb,c,d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   struct Exchange{ |   struct Exchange{ | ||||||
|     // 3210 ordering |     // 3210 ordering | ||||||
|     static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ |     static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ | ||||||
| @@ -335,8 +386,10 @@ namespace Optimization { | |||||||
|       out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); |       out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); | ||||||
|     }; |     }; | ||||||
|     static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ |     static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ | ||||||
|       out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); |       out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ | ||||||
|       out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); |       out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ | ||||||
|  |       out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ | ||||||
|  |       out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ | ||||||
|     }; |     }; | ||||||
|     static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ |     static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ | ||||||
|       assert(0); |       assert(0); | ||||||
| @@ -384,13 +437,8 @@ namespace Optimization { | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| #ifndef _mm_alignr_epi64 |     template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; | ||||||
| #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) |     template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; | ||||||
| #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) |  | ||||||
| #endif  |  | ||||||
|  |  | ||||||
|     template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; |  | ||||||
|     template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
| @@ -450,6 +498,7 @@ namespace Optimization { | |||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types  | // Here assign types  | ||||||
|  |  | ||||||
|  |   typedef __m128i SIMD_Htype;  // Single precision type | ||||||
|   typedef __m128  SIMD_Ftype;  // Single precision type |   typedef __m128  SIMD_Ftype;  // Single precision type | ||||||
|   typedef __m128d SIMD_Dtype; // Double precision type |   typedef __m128d SIMD_Dtype; // Double precision type | ||||||
|   typedef __m128i SIMD_Itype; // Integer type |   typedef __m128i SIMD_Itype; // Integer type | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ | |||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
| Source file: ./lib/simd/Grid_vector_types.h | Source file: ./lib/simd/Grid_vector_type.h | ||||||
|  |  | ||||||
| Copyright (C) 2015 | Copyright (C) 2015 | ||||||
|  |  | ||||||
| @@ -358,16 +358,12 @@ class Grid_simd { | |||||||
|   { |   { | ||||||
|     if       (n==3) { |     if       (n==3) { | ||||||
|       Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); |       Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); | ||||||
|       //      std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl; |  | ||||||
|     } else if(n==2) { |     } else if(n==2) { | ||||||
|       Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); |       Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); | ||||||
|       //      std::cout << " Exchange2 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl; |  | ||||||
|     } else if(n==1) { |     } else if(n==1) { | ||||||
|       Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); |       Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); | ||||||
|       //      std::cout << " Exchange1 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl; |  | ||||||
|     } else if(n==0) {  |     } else if(n==0) {  | ||||||
|       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); |       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); | ||||||
|       //      std::cout << " Exchange0 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl; |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -415,7 +411,6 @@ template <class S, class V, IfNotComplex<S> = 0> | |||||||
| inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | ||||||
|   nrot = nrot % Grid_simd<S, V>::Nsimd(); |   nrot = nrot % Grid_simd<S, V>::Nsimd(); | ||||||
|   Grid_simd<S, V> ret; |   Grid_simd<S, V> ret; | ||||||
|   //    std::cout << "Rotate Real by "<<nrot<<std::endl; |  | ||||||
|   ret.v = Optimization::Rotate::rotate(b.v, nrot); |   ret.v = Optimization::Rotate::rotate(b.v, nrot); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @@ -423,7 +418,6 @@ template <class S, class V, IfComplex<S> = 0> | |||||||
| inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | ||||||
|   nrot = nrot % Grid_simd<S, V>::Nsimd(); |   nrot = nrot % Grid_simd<S, V>::Nsimd(); | ||||||
|   Grid_simd<S, V> ret; |   Grid_simd<S, V> ret; | ||||||
|   //    std::cout << "Rotate Complex by "<<nrot<<std::endl; |  | ||||||
|   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); |   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @@ -431,14 +425,12 @@ template <class S, class V, IfNotComplex<S> =0> | |||||||
| inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | ||||||
| { | { | ||||||
|   nrot = nrot % Grid_simd<S,V>::Nsimd(); |   nrot = nrot % Grid_simd<S,V>::Nsimd(); | ||||||
|   //    std::cout << "Rotate Real by "<<nrot<<std::endl; |  | ||||||
|   ret.v = Optimization::Rotate::rotate(b.v,nrot); |   ret.v = Optimization::Rotate::rotate(b.v,nrot); | ||||||
| } | } | ||||||
| template <class S, class V, IfComplex<S> =0>  | template <class S, class V, IfComplex<S> =0>  | ||||||
| inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | ||||||
| { | { | ||||||
|   nrot = nrot % Grid_simd<S,V>::Nsimd(); |   nrot = nrot % Grid_simd<S,V>::Nsimd(); | ||||||
|   //    std::cout << "Rotate Complex by "<<nrot<<std::endl; |  | ||||||
|   ret.v = Optimization::Rotate::rotate(b.v,2*nrot); |   ret.v = Optimization::Rotate::rotate(b.v,2*nrot); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -698,7 +690,6 @@ inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l, | |||||||
|                                     const Grid_simd<S, V> &r) { |                                     const Grid_simd<S, V> &r) { | ||||||
|   return conjugate(l) * r; |   return conjugate(l) * r; | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class S, class V> | template <class S, class V> | ||||||
| inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l, | inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l, | ||||||
|                                     const Grid_simd<S, V> &r) { |                                     const Grid_simd<S, V> &r) { | ||||||
| @@ -758,6 +749,67 @@ typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF; | |||||||
| typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD; | typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD; | ||||||
| typedef Grid_simd<Integer, SIMD_Itype> vInteger; | typedef Grid_simd<Integer, SIMD_Itype> vInteger; | ||||||
|  |  | ||||||
|  | // Half precision; no arithmetic support | ||||||
|  | typedef Grid_simd<uint16_t, SIMD_Htype>               vRealH; | ||||||
|  | typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH; | ||||||
|  |  | ||||||
|  | inline void precisionChange(vRealF    *out,vRealD    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x1)==0); | ||||||
|  |   for(int m=0;m*2<nvec;m++){ | ||||||
|  |     int n=m*2; | ||||||
|  |     out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vRealH    *out,vRealD    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x3)==0); | ||||||
|  |   for(int m=0;m*4<nvec;m++){ | ||||||
|  |     int n=m*4; | ||||||
|  |     out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vRealH    *out,vRealF    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x1)==0); | ||||||
|  |   for(int m=0;m*2<nvec;m++){ | ||||||
|  |     int n=m*2; | ||||||
|  |     out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vRealD    *out,vRealF    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x1)==0); | ||||||
|  |   for(int m=0;m*2<nvec;m++){ | ||||||
|  |     int n=m*2; | ||||||
|  |     Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vRealD    *out,vRealH    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x3)==0); | ||||||
|  |   for(int m=0;m*4<nvec;m++){ | ||||||
|  |     int n=m*4; | ||||||
|  |     Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vRealF    *out,vRealH    *in,int nvec) | ||||||
|  | { | ||||||
|  |   assert((nvec&0x1)==0); | ||||||
|  |   for(int m=0;m*2<nvec;m++){ | ||||||
|  |     int n=m*2; | ||||||
|  |     Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);} | ||||||
|  | inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);} | ||||||
|  | inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);} | ||||||
|  | inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);} | ||||||
|  | inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);} | ||||||
|  | inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);} | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // Check our vector types are of an appropriate size. | // Check our vector types are of an appropriate size. | ||||||
| #if defined QPX | #if defined QPX | ||||||
| static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); | static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); | ||||||
|   | |||||||
| @@ -56,11 +56,11 @@ class iScalar { | |||||||
|   typedef vtype element; |   typedef vtype element; | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; |   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; | ||||||
|   typedef typename GridTypeMapper<vtype>::vector_type vector_type; |   typedef typename GridTypeMapper<vtype>::vector_type vector_type; | ||||||
|  |   typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD; | ||||||
|   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; |   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; | ||||||
|   typedef iScalar<tensor_reduced_v> tensor_reduced; |  | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; |   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; | ||||||
|  |   typedef iScalar<tensor_reduced_v> tensor_reduced; | ||||||
|   typedef iScalar<recurse_scalar_object> scalar_object; |   typedef iScalar<recurse_scalar_object> scalar_object; | ||||||
|  |  | ||||||
|   // substitutes a real or complex version with same tensor structure |   // substitutes a real or complex version with same tensor structure | ||||||
|   typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified; |   typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified; | ||||||
|   typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified; |   typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified; | ||||||
| @@ -77,8 +77,12 @@ class iScalar { | |||||||
|   iScalar<vtype> & operator= (const iScalar<vtype> ©me) = default; |   iScalar<vtype> & operator= (const iScalar<vtype> ©me) = default; | ||||||
|   iScalar<vtype> & operator= (iScalar<vtype> &©me) = default; |   iScalar<vtype> & operator= (iScalar<vtype> &©me) = default; | ||||||
|   */ |   */ | ||||||
|   iScalar(scalar_type s) |  | ||||||
|       : _internal(s){};  // recurse down and hit the constructor for vector_type |   //  template<int N=0> | ||||||
|  |   //  iScalar(EnableIf<isSIMDvectorized<vector_type>, vector_type> s) : _internal(s){};  // recurse down and hit the constructor for vector_type | ||||||
|  |  | ||||||
|  |   iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type | ||||||
|  |  | ||||||
|   iScalar(const Zero &z) { *this = zero; }; |   iScalar(const Zero &z) { *this = zero; }; | ||||||
|  |  | ||||||
|   iScalar<vtype> &operator=(const Zero &hero) { |   iScalar<vtype> &operator=(const Zero &hero) { | ||||||
| @@ -134,32 +138,27 @@ class iScalar { | |||||||
|   strong_inline const vtype &operator()(void) const { return _internal; } |   strong_inline const vtype &operator()(void) const { return _internal; } | ||||||
|  |  | ||||||
|   // Type casts meta programmed, must be pure scalar to match TensorRemove |   // Type casts meta programmed, must be pure scalar to match TensorRemove | ||||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, |   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> | ||||||
|             IfNotSimd<U> = 0> |  | ||||||
|   operator ComplexF() const { |   operator ComplexF() const { | ||||||
|     return (TensorRemove(_internal)); |     return (TensorRemove(_internal)); | ||||||
|   }; |   }; | ||||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, |   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> | ||||||
|             IfNotSimd<U> = 0> |  | ||||||
|   operator ComplexD() const { |   operator ComplexD() const { | ||||||
|     return (TensorRemove(_internal)); |     return (TensorRemove(_internal)); | ||||||
|   }; |   }; | ||||||
|   //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = |   //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = | ||||||
|   //  0> operator RealD    () const { return(real(TensorRemove(_internal))); } |   //  0> operator RealD    () const { return(real(TensorRemove(_internal))); } | ||||||
|   template <class U = vtype, class V = scalar_type, IfReal<V> = 0, |   template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> | ||||||
|             IfNotSimd<U> = 0> |  | ||||||
|   operator RealD() const { |   operator RealD() const { | ||||||
|     return TensorRemove(_internal); |     return TensorRemove(_internal); | ||||||
|   } |   } | ||||||
|   template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, |   template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> | ||||||
|             IfNotSimd<U> = 0> |  | ||||||
|   operator Integer() const { |   operator Integer() const { | ||||||
|     return Integer(TensorRemove(_internal)); |     return Integer(TensorRemove(_internal)); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // convert from a something to a scalar via constructor of something arg |   // convert from a something to a scalar via constructor of something arg | ||||||
|   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type |   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr> | ||||||
|                          * = nullptr> |  | ||||||
|     strong_inline iScalar<vtype> operator=(T arg) { |     strong_inline iScalar<vtype> operator=(T arg) { | ||||||
|     _internal = arg; |     _internal = arg; | ||||||
|     return *this; |     return *this; | ||||||
| @@ -193,6 +192,7 @@ class iVector { | |||||||
|   typedef vtype element; |   typedef vtype element; | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; |   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; | ||||||
|   typedef typename GridTypeMapper<vtype>::vector_type vector_type; |   typedef typename GridTypeMapper<vtype>::vector_type vector_type; | ||||||
|  |   typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD; | ||||||
|   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; |   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; |   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; | ||||||
|   typedef iScalar<tensor_reduced_v> tensor_reduced; |   typedef iScalar<tensor_reduced_v> tensor_reduced; | ||||||
| @@ -305,6 +305,7 @@ class iMatrix { | |||||||
|   typedef vtype element; |   typedef vtype element; | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; |   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type; | ||||||
|   typedef typename GridTypeMapper<vtype>::vector_type vector_type; |   typedef typename GridTypeMapper<vtype>::vector_type vector_type; | ||||||
|  |   typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD; | ||||||
|   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; |   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; | ||||||
|   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; |   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -35,13 +35,71 @@ namespace Grid { | |||||||
|   // innerProduct Matrix x Matrix -> Scalar |   // innerProduct Matrix x Matrix -> Scalar | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   template<class sobj> inline RealD norm2(const sobj &arg){ |   template<class sobj> inline RealD norm2(const sobj &arg){ | ||||||
|       typedef typename sobj::scalar_type scalar; |     auto nrm = innerProductD(arg,arg); | ||||||
|       decltype(innerProduct(arg,arg)) nrm; |  | ||||||
|       nrm = innerProduct(arg,arg); |  | ||||||
|     RealD ret = real(nrm); |     RealD ret = real(nrm); | ||||||
|     return ret; |     return ret; | ||||||
|   } |   } | ||||||
|  |   ////////////////////////////////////// | ||||||
|  |   // If single promote to double and sum 2x | ||||||
|  |   ////////////////////////////////////// | ||||||
|  |  | ||||||
|  | inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){  return innerProduct(l,r); } | ||||||
|  | inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){  return innerProduct(l,r); } | ||||||
|  | inline RealD    innerProductD(const RealD    &l,const RealD    &r){  return innerProduct(l,r); } | ||||||
|  | inline RealD    innerProductD(const RealF    &l,const RealF    &r){  return innerProduct(l,r); } | ||||||
|  |  | ||||||
|  | inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){  return innerProduct(l,r); } | ||||||
|  | inline vRealD    innerProductD(const vRealD    &l,const vRealD    &r){  return innerProduct(l,r); } | ||||||
|  | inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r){   | ||||||
|  |   vComplexD la,lb; | ||||||
|  |   vComplexD ra,rb; | ||||||
|  |   Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); | ||||||
|  |   Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v); | ||||||
|  |   return innerProduct(la,ra) + innerProduct(lb,rb);  | ||||||
|  | } | ||||||
|  | inline vRealD innerProductD(const vRealF &l,const vRealF &r){   | ||||||
|  |   vRealD la,lb; | ||||||
|  |   vRealD ra,rb; | ||||||
|  |   Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); | ||||||
|  |   Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v); | ||||||
|  |   return innerProduct(la,ra) + innerProduct(lb,rb);  | ||||||
|  | } | ||||||
|  |  | ||||||
|  |   template<class l,class r,int N> inline | ||||||
|  |   auto innerProductD (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0],rhs._internal[0]))> | ||||||
|  |   { | ||||||
|  |     typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; | ||||||
|  |     iScalar<ret_t> ret; | ||||||
|  |     ret=zero; | ||||||
|  |     for(int c1=0;c1<N;c1++){ | ||||||
|  |       ret._internal += innerProductD(lhs._internal[c1],rhs._internal[c1]); | ||||||
|  |     } | ||||||
|  |     return ret; | ||||||
|  |   } | ||||||
|  |   template<class l,class r,int N> inline | ||||||
|  |   auto innerProductD (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0]))> | ||||||
|  |   { | ||||||
|  |     typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; | ||||||
|  |     iScalar<ret_t> ret; | ||||||
|  |     iScalar<ret_t> tmp; | ||||||
|  |     ret=zero; | ||||||
|  |     for(int c1=0;c1<N;c1++){ | ||||||
|  |     for(int c2=0;c2<N;c2++){ | ||||||
|  |       ret._internal+=innerProductD(lhs._internal[c1][c2],rhs._internal[c1][c2]); | ||||||
|  |     }} | ||||||
|  |     return ret; | ||||||
|  |   } | ||||||
|  |   template<class l,class r> inline | ||||||
|  |   auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD(lhs._internal,rhs._internal))> | ||||||
|  |   { | ||||||
|  |     typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; | ||||||
|  |     iScalar<ret_t> ret; | ||||||
|  |     ret._internal = innerProductD(lhs._internal,rhs._internal); | ||||||
|  |     return ret; | ||||||
|  |   } | ||||||
|  |   ////////////////////// | ||||||
|  |   // Keep same precison | ||||||
|  |   ////////////////////// | ||||||
|   template<class l,class r,int N> inline |   template<class l,class r,int N> inline | ||||||
|   auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))> |   auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))> | ||||||
|   { |   { | ||||||
|   | |||||||
| @@ -53,6 +53,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef typename T::scalar_type scalar_type; |     typedef typename T::scalar_type scalar_type; | ||||||
|     typedef typename T::vector_type vector_type; |     typedef typename T::vector_type vector_type; | ||||||
|  |     typedef typename T::vector_typeD vector_typeD; | ||||||
|     typedef typename T::tensor_reduced tensor_reduced; |     typedef typename T::tensor_reduced tensor_reduced; | ||||||
|     typedef typename T::scalar_object scalar_object; |     typedef typename T::scalar_object scalar_object; | ||||||
|     typedef typename T::Complexified Complexified; |     typedef typename T::Complexified Complexified; | ||||||
| @@ -67,6 +68,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef RealF scalar_type; |     typedef RealF scalar_type; | ||||||
|     typedef RealF vector_type; |     typedef RealF vector_type; | ||||||
|  |     typedef RealD vector_typeD; | ||||||
|     typedef RealF tensor_reduced ; |     typedef RealF tensor_reduced ; | ||||||
|     typedef RealF scalar_object; |     typedef RealF scalar_object; | ||||||
|     typedef ComplexF Complexified; |     typedef ComplexF Complexified; | ||||||
| @@ -77,6 +79,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef RealD scalar_type; |     typedef RealD scalar_type; | ||||||
|     typedef RealD vector_type; |     typedef RealD vector_type; | ||||||
|  |     typedef RealD vector_typeD; | ||||||
|     typedef RealD tensor_reduced; |     typedef RealD tensor_reduced; | ||||||
|     typedef RealD scalar_object; |     typedef RealD scalar_object; | ||||||
|     typedef ComplexD Complexified; |     typedef ComplexD Complexified; | ||||||
| @@ -87,6 +90,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef ComplexF scalar_type; |     typedef ComplexF scalar_type; | ||||||
|     typedef ComplexF vector_type; |     typedef ComplexF vector_type; | ||||||
|  |     typedef ComplexD vector_typeD; | ||||||
|     typedef ComplexF tensor_reduced; |     typedef ComplexF tensor_reduced; | ||||||
|     typedef ComplexF scalar_object; |     typedef ComplexF scalar_object; | ||||||
|     typedef ComplexF Complexified; |     typedef ComplexF Complexified; | ||||||
| @@ -97,6 +101,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef ComplexD scalar_type; |     typedef ComplexD scalar_type; | ||||||
|     typedef ComplexD vector_type; |     typedef ComplexD vector_type; | ||||||
|  |     typedef ComplexD vector_typeD; | ||||||
|     typedef ComplexD tensor_reduced; |     typedef ComplexD tensor_reduced; | ||||||
|     typedef ComplexD scalar_object; |     typedef ComplexD scalar_object; | ||||||
|     typedef ComplexD Complexified; |     typedef ComplexD Complexified; | ||||||
| @@ -107,6 +112,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef Integer scalar_type; |     typedef Integer scalar_type; | ||||||
|     typedef Integer vector_type; |     typedef Integer vector_type; | ||||||
|  |     typedef Integer vector_typeD; | ||||||
|     typedef Integer tensor_reduced; |     typedef Integer tensor_reduced; | ||||||
|     typedef Integer scalar_object; |     typedef Integer scalar_object; | ||||||
|     typedef void Complexified; |     typedef void Complexified; | ||||||
| @@ -118,6 +124,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef RealF  scalar_type; |     typedef RealF  scalar_type; | ||||||
|     typedef vRealF vector_type; |     typedef vRealF vector_type; | ||||||
|  |     typedef vRealD vector_typeD; | ||||||
|     typedef vRealF tensor_reduced; |     typedef vRealF tensor_reduced; | ||||||
|     typedef RealF  scalar_object; |     typedef RealF  scalar_object; | ||||||
|     typedef vComplexF Complexified; |     typedef vComplexF Complexified; | ||||||
| @@ -128,6 +135,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef RealD  scalar_type; |     typedef RealD  scalar_type; | ||||||
|     typedef vRealD vector_type; |     typedef vRealD vector_type; | ||||||
|  |     typedef vRealD vector_typeD; | ||||||
|     typedef vRealD tensor_reduced; |     typedef vRealD tensor_reduced; | ||||||
|     typedef RealD  scalar_object; |     typedef RealD  scalar_object; | ||||||
|     typedef vComplexD Complexified; |     typedef vComplexD Complexified; | ||||||
| @@ -138,6 +146,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef ComplexF  scalar_type; |     typedef ComplexF  scalar_type; | ||||||
|     typedef vComplexF vector_type; |     typedef vComplexF vector_type; | ||||||
|  |     typedef vComplexD vector_typeD; | ||||||
|     typedef vComplexF tensor_reduced; |     typedef vComplexF tensor_reduced; | ||||||
|     typedef ComplexF  scalar_object; |     typedef ComplexF  scalar_object; | ||||||
|     typedef vComplexF Complexified; |     typedef vComplexF Complexified; | ||||||
| @@ -148,6 +157,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef ComplexD  scalar_type; |     typedef ComplexD  scalar_type; | ||||||
|     typedef vComplexD vector_type; |     typedef vComplexD vector_type; | ||||||
|  |     typedef vComplexD vector_typeD; | ||||||
|     typedef vComplexD tensor_reduced; |     typedef vComplexD tensor_reduced; | ||||||
|     typedef ComplexD  scalar_object; |     typedef ComplexD  scalar_object; | ||||||
|     typedef vComplexD Complexified; |     typedef vComplexD Complexified; | ||||||
| @@ -158,6 +168,7 @@ namespace Grid { | |||||||
|   public: |   public: | ||||||
|     typedef  Integer scalar_type; |     typedef  Integer scalar_type; | ||||||
|     typedef vInteger vector_type; |     typedef vInteger vector_type; | ||||||
|  |     typedef vInteger vector_typeD; | ||||||
|     typedef vInteger tensor_reduced; |     typedef vInteger tensor_reduced; | ||||||
|     typedef  Integer scalar_object; |     typedef  Integer scalar_object; | ||||||
|     typedef void Complexified; |     typedef void Complexified; | ||||||
| @@ -241,7 +252,8 @@ namespace Grid { | |||||||
|   template<typename T> |   template<typename T> | ||||||
|   class isSIMDvectorized{ |   class isSIMDvectorized{ | ||||||
|     template<typename U> |     template<typename U> | ||||||
|     static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *); |     static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,    | ||||||
|  |       typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *); | ||||||
|  |  | ||||||
|     template<typename U> |     template<typename U> | ||||||
|     static double test(...); |     static double test(...); | ||||||
|   | |||||||
| @@ -311,8 +311,8 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"Performance:"<<std::endl; |     std::cout<<GridLogMessage<<"Performance:"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --comms-isend   : Asynchronous MPI calls; several dirs at a time "<<std::endl;     |     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<"  --comms-sendrecv: Synchronous MPI calls; one dirs at a time "<<std::endl;     |     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;     |     std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;     |     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;     | ||||||
| @@ -457,5 +457,6 @@ void Grid_debug_handler_init(void) | |||||||
|  |  | ||||||
|   sigaction(SIGFPE,&sa,NULL); |   sigaction(SIGFPE,&sa,NULL); | ||||||
|   sigaction(SIGKILL,&sa,NULL); |   sigaction(SIGKILL,&sa,NULL); | ||||||
|  |   sigaction(SIGILL,&sa,NULL); | ||||||
| } | } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -308,18 +308,23 @@ public: | |||||||
|   int n; |   int n; | ||||||
|   funcExchange(int _n) { n=_n;}; |   funcExchange(int _n) { n=_n;}; | ||||||
|   template<class vec>    void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} |   template<class vec>    void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} | ||||||
|   template<class scal>   void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2)  const {  |   template<class scal>   void apply(std::vector<scal> &r1, | ||||||
|  | 				    std::vector<scal> &r2, | ||||||
|  | 				    std::vector<scal> &in1, | ||||||
|  | 				    std::vector<scal> &in2)  const  | ||||||
|  |   {  | ||||||
|     int sz=in1.size(); |     int sz=in1.size(); | ||||||
|  |  | ||||||
|      |  | ||||||
|     int msk = sz>>(n+1); |     int msk = sz>>(n+1); | ||||||
|  |  | ||||||
|     int j1=0; |     for(int i=0;i<sz;i++) { | ||||||
|     int j2=0; |       int j1 = i&(~msk); | ||||||
|     for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ]; |       int j2 = i|msk; | ||||||
|     for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ]; |       if  ( (i&msk) == 0 ) { r1[i]=in1[j1];} | ||||||
|     for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in1[ i ]; |       else                 { r1[i]=in2[j1];} | ||||||
|     for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in2[ i ]; |  | ||||||
|  |       if  ( (i&msk) == 0 ) { r2[i]=in1[j2];} | ||||||
|  |       else                 { r2[i]=in2[j2];} | ||||||
|  |     }       | ||||||
|   } |   } | ||||||
|   std::string name(void) const { return std::string("Exchange"); } |   std::string name(void) const { return std::string("Exchange"); } | ||||||
| }; | }; | ||||||
| @@ -454,8 +459,8 @@ void ExchangeTester(const functor &func) | |||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl; |   std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl; | ||||||
|  |  | ||||||
|   //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl; |   //for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" ref "<<reference1[i]<<" res "<<result1[i]<<std::endl; | ||||||
|   //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl; |   //for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" ref "<<reference2[i]<<" res "<<result2[i]<<std::endl; | ||||||
|  |  | ||||||
|   for(int i=0;i<Nsimd;i++){ |   for(int i=0;i<Nsimd;i++){ | ||||||
|     int found=0; |     int found=0; | ||||||
| @@ -465,7 +470,7 @@ void ExchangeTester(const functor &func) | |||||||
| 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl; | 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     assert(found==1); |     //    assert(found==1); | ||||||
|   } |   } | ||||||
|   for(int i=0;i<Nsimd;i++){ |   for(int i=0;i<Nsimd;i++){ | ||||||
|     int found=0; |     int found=0; | ||||||
| @@ -475,15 +480,24 @@ void ExchangeTester(const functor &func) | |||||||
| 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl; | 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     assert(found==1); |     //    assert(found==1); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   /* | ||||||
|  |   for(int i=0;i<Nsimd;i++){ | ||||||
|  |     std::cout << " i "<< i | ||||||
|  | 	      <<" result1  "<<result1[i] | ||||||
|  | 	      <<" result2  "<<result2[i] | ||||||
|  | 	      <<" test1  "<<test1[i] | ||||||
|  | 	      <<" test2  "<<test2[i] | ||||||
|  | 	      <<" input1 "<<input1[i] | ||||||
|  | 	      <<" input2 "<<input2[i]<<std::endl; | ||||||
|  |   } | ||||||
|  |   */ | ||||||
|   for(int i=0;i<Nsimd;i++){ |   for(int i=0;i<Nsimd;i++){ | ||||||
|     assert(test1[i]==input1[i]); |     assert(test1[i]==input1[i]); | ||||||
|     assert(test2[i]==input2[i]); |     assert(test2[i]==input2[i]); | ||||||
|   }//    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl; |   } | ||||||
|     //    std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl; |  | ||||||
|   //  } |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -678,5 +692,69 @@ int main (int argc, char ** argv) | |||||||
|   IntTester(funcMinus()); |   IntTester(funcMinus()); | ||||||
|   IntTester(funcTimes()); |   IntTester(funcTimes()); | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "==================================="<<  std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "Testing precisionChange            "<<  std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "==================================="<<  std::endl; | ||||||
|  |   { | ||||||
|  |     GridSerialRNG          sRNG; | ||||||
|  |     sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |     const int Ndp = 16; | ||||||
|  |     const int Nsp = Ndp/2; | ||||||
|  |     const int Nhp = Ndp/4; | ||||||
|  |     std::vector<vRealH,alignedAllocator<vRealH> > H (Nhp); | ||||||
|  |     std::vector<vRealF,alignedAllocator<vRealF> > F (Nsp); | ||||||
|  |     std::vector<vRealF,alignedAllocator<vRealF> > FF(Nsp); | ||||||
|  |     std::vector<vRealD,alignedAllocator<vRealD> > D (Ndp); | ||||||
|  |     std::vector<vRealD,alignedAllocator<vRealD> > DD(Ndp); | ||||||
|  |     for(int i=0;i<16;i++){ | ||||||
|  |       random(sRNG,D[i]); | ||||||
|  |     } | ||||||
|  |     // Double to Single | ||||||
|  |     precisionChange(&F[0],&D[0],Ndp); | ||||||
|  |     precisionChange(&DD[0],&F[0],Ndp); | ||||||
|  |     std::cout << GridLogMessage<<"Double to single"; | ||||||
|  |     for(int i=0;i<Ndp;i++){ | ||||||
|  |       //      std::cout << "DD["<<i<<"] = "<< DD[i]<<" "<<D[i]<<" "<<DD[i]-D[i] <<std::endl;  | ||||||
|  |       DD[i] = DD[i] - D[i]; | ||||||
|  |       decltype(innerProduct(DD[0],DD[0])) nrm; | ||||||
|  |       nrm = innerProduct(DD[i],DD[i]); | ||||||
|  |       auto tmp = Reduce(nrm); | ||||||
|  |       //      std::cout << tmp << std::endl; | ||||||
|  |       assert( tmp < 1.0e-14 );  | ||||||
|  |     } | ||||||
|  |     std::cout <<" OK ! "<<std::endl; | ||||||
|  |  | ||||||
|  |     // Double to Half | ||||||
|  | #ifdef USE_FP16 | ||||||
|  |     std::cout << GridLogMessage<< "Double to half" ; | ||||||
|  |     precisionChange(&H[0],&D[0],Ndp); | ||||||
|  |     precisionChange(&DD[0],&H[0],Ndp); | ||||||
|  |     for(int i=0;i<Ndp;i++){ | ||||||
|  |       //      std::cout << "DD["<<i<<"] = "<< DD[i]<<" "<<D[i]<<" "<<DD[i]-D[i]<<std::endl;  | ||||||
|  |       DD[i] = DD[i] - D[i]; | ||||||
|  |       decltype(innerProduct(DD[0],DD[0])) nrm; | ||||||
|  |       nrm = innerProduct(DD[i],DD[i]); | ||||||
|  |       auto tmp = Reduce(nrm); | ||||||
|  |       //      std::cout << tmp << std::endl; | ||||||
|  |       assert( tmp < 1.0e-3 );  | ||||||
|  |     } | ||||||
|  |     std::cout <<" OK ! "<<std::endl; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage<< "Single to half"; | ||||||
|  |     // Single to Half | ||||||
|  |     precisionChange(&H[0] ,&F[0],Nsp); | ||||||
|  |     precisionChange(&FF[0],&H[0],Nsp); | ||||||
|  |     for(int i=0;i<Nsp;i++){ | ||||||
|  |       //      std::cout << "FF["<<i<<"] = "<< FF[i]<<" "<<F[i]<<" "<<FF[i]-F[i]<<std::endl;  | ||||||
|  |       FF[i] = FF[i] - F[i]; | ||||||
|  |       decltype(innerProduct(FF[0],FF[0])) nrm; | ||||||
|  |       nrm = innerProduct(FF[i],FF[i]); | ||||||
|  |       auto tmp = Reduce(nrm); | ||||||
|  |       //      std::cout << tmp << std::endl; | ||||||
|  |       assert( tmp < 1.0e-3 );  | ||||||
|  |     } | ||||||
|  |     std::cout <<" OK ! "<<std::endl; | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -148,11 +148,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl { | |||||||
|     Complex psqMax(16.0); |     Complex psqMax(16.0); | ||||||
|     Fp =  psqMax*one/psq; |     Fp =  psqMax*one/psq; | ||||||
|  |  | ||||||
|  |     /* | ||||||
|     static int once; |     static int once; | ||||||
|     if ( once == 0 ) {  |     if ( once == 0 ) {  | ||||||
|       std::cout << " Fp " << Fp <<std::endl; |       std::cout << " Fp " << Fp <<std::endl; | ||||||
|       once ++; |       once ++; | ||||||
|     } |       }*/ | ||||||
|  |  | ||||||
|     pokeSite(TComplex(1.0),Fp,coor); |     pokeSite(TComplex(1.0),Fp,coor); | ||||||
|  |  | ||||||
|     dmuAmu_p  = dmuAmu_p * Fp;  |     dmuAmu_p  = dmuAmu_p * Fp;  | ||||||
|   | |||||||
| @@ -2,11 +2,10 @@ | |||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|     Source file: ./tests/Test_wilson_even_odd.cc |     Source file: ./tests/Test_wilson_tm_even_odd.cc | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2015 | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |     This program is free software; you can redistribute it and/or modify | ||||||
| @@ -89,8 +88,8 @@ int main (int argc, char ** argv) | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   RealD mass=0.1; |   RealD mass=0.1; | ||||||
|   RealD mu  = 0.1; |  | ||||||
|   WilsonTMFermionR Dw(Umu,Grid,RBGrid,mass,mu); |   WilsonFermionR Dw(Umu,Grid,RBGrid,mass); | ||||||
|  |  | ||||||
|   LatticeFermion src_e   (&RBGrid); |   LatticeFermion src_e   (&RBGrid); | ||||||
|   LatticeFermion src_o   (&RBGrid); |   LatticeFermion src_o   (&RBGrid); | ||||||
| @@ -207,7 +206,7 @@ int main (int argc, char ** argv) | |||||||
|   pickCheckerboard(Odd ,phi_o,phi); |   pickCheckerboard(Odd ,phi_o,phi); | ||||||
|   RealD t1,t2; |   RealD t1,t2; | ||||||
|  |  | ||||||
|   SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw); |   SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw); | ||||||
|   HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); |   HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); | ||||||
|   HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); |   HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,10 +2,11 @@ | |||||||
| 
 | 
 | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| 
 | 
 | ||||||
|     Source file: ./tests/Test_wilson_tm_even_odd.cc |     Source file: ./tests/Test_wilson_even_odd.cc | ||||||
| 
 | 
 | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2015 | ||||||
| 
 | 
 | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
| 
 | 
 | ||||||
|     This program is free software; you can redistribute it and/or modify |     This program is free software; you can redistribute it and/or modify | ||||||
| @@ -88,8 +89,8 @@ int main (int argc, char ** argv) | |||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   RealD mass=0.1; |   RealD mass=0.1; | ||||||
| 
 |   RealD mu  = 0.1; | ||||||
|   WilsonFermionR Dw(Umu,Grid,RBGrid,mass); |   WilsonTMFermionR Dw(Umu,Grid,RBGrid,mass,mu); | ||||||
| 
 | 
 | ||||||
|   LatticeFermion src_e   (&RBGrid); |   LatticeFermion src_e   (&RBGrid); | ||||||
|   LatticeFermion src_o   (&RBGrid); |   LatticeFermion src_o   (&RBGrid); | ||||||
| @@ -206,7 +207,7 @@ int main (int argc, char ** argv) | |||||||
|   pickCheckerboard(Odd ,phi_o,phi); |   pickCheckerboard(Odd ,phi_o,phi); | ||||||
|   RealD t1,t2; |   RealD t1,t2; | ||||||
| 
 | 
 | ||||||
|   SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw); |   SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw); | ||||||
|   HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); |   HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); | ||||||
|   HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); |   HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); | ||||||
| 
 | 
 | ||||||
| @@ -115,8 +115,8 @@ int main (int argc, char ** argv) | |||||||
|   RNG.SeedFixedIntegers(seeds); |   RNG.SeedFixedIntegers(seeds); | ||||||
|  |  | ||||||
|  |  | ||||||
|   RealD alpha = 1.0; |   RealD alpha = 1.2; | ||||||
|   RealD beta  = 0.03; |   RealD beta  = 0.1; | ||||||
|   RealD mu    = 0.0; |   RealD mu    = 0.0; | ||||||
|   int order = 11; |   int order = 11; | ||||||
|   ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order); |   ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order); | ||||||
| @@ -131,10 +131,9 @@ int main (int argc, char ** argv) | |||||||
|   const int Nit= 10000; |   const int Nit= 10000; | ||||||
|  |  | ||||||
|   int Nconv; |   int Nconv; | ||||||
|   RealD eresid = 1.0e-8; |   RealD eresid = 1.0e-6; | ||||||
|  |  | ||||||
|   ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit); |   ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit); | ||||||
|  |  | ||||||
|   ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); |   ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); | ||||||
|  |  | ||||||
|   LatticeComplex src(grid); gaussian(RNG,src); |   LatticeComplex src(grid); gaussian(RNG,src); | ||||||
| @@ -145,9 +144,9 @@ int main (int argc, char ** argv) | |||||||
|   } |   } | ||||||
|    |    | ||||||
|   { |   { | ||||||
|     //    std::vector<RealD>          eval(Nm); |     std::vector<RealD>          eval(Nm); | ||||||
|     //    std::vector<LatticeComplex> evec(Nm,grid); |     std::vector<LatticeComplex> evec(Nm,grid); | ||||||
|     //    ChebyIRL.calc(eval,evec,src, Nconv); |     ChebyIRL.calc(eval,evec,src, Nconv); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   | |||||||
| @@ -89,7 +89,7 @@ int main(int argc, char** argv) { | |||||||
|   GridStopWatch CGTimer; |   GridStopWatch CGTimer; | ||||||
|  |  | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionR, LatticeFermion> HermOpEO(Ddwf); |   SchurDiagMooeeOperator<DomainWallFermionR, LatticeFermion> HermOpEO(Ddwf); | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000, 0);// switch off the assert |   ConjugateGradient<LatticeFermion> CG(1.0e-5, 10000, 0);// switch off the assert | ||||||
|  |  | ||||||
|   CGTimer.Start(); |   CGTimer.Start(); | ||||||
|   CG(HermOpEO, src_o, result_o); |   CG(HermOpEO, src_o, result_o); | ||||||
|   | |||||||
| @@ -73,7 +73,7 @@ int main (int argc, char ** argv) | |||||||
|   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); |   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||||
|  |  | ||||||
|   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf); |   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf); | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermion> CG(1.0e-6,10000); | ||||||
|   CG(HermOp,src,result); |   CG(HermOp,src,result); | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   | |||||||
							
								
								
									
										119
									
								
								tests/solver/Test_staggered_block_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								tests/solver/Test_staggered_block_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,119 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/Test_wilson_cg_unprec.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  | template<class d> | ||||||
|  | struct scal { | ||||||
|  |   d internal; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |   Gamma::Algebra Gmu [] = { | ||||||
|  |     Gamma::Algebra::GammaX, | ||||||
|  |     Gamma::Algebra::GammaY, | ||||||
|  |     Gamma::Algebra::GammaZ, | ||||||
|  |     Gamma::Algebra::GammaT | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;  | ||||||
|  |   typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;  | ||||||
|  |   typename ImprovedStaggeredFermion5DR::ImplParams params;  | ||||||
|  |  | ||||||
|  |   const int Ls=4; | ||||||
|  |  | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   std::vector<int> latt_size   = GridDefaultLatt(); | ||||||
|  |   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
|  |   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   std::vector<int> seeds({1,2,3,4}); | ||||||
|  |   GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds); | ||||||
|  |   GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds); | ||||||
|  |  | ||||||
|  |   FermionField src(FGrid); random(pRNG5,src); | ||||||
|  |   FermionField result(FGrid); result=zero; | ||||||
|  |   RealD nrm = norm2(src); | ||||||
|  |  | ||||||
|  |   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); | ||||||
|  |  | ||||||
|  |   RealD mass=0.01; | ||||||
|  |   ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); | ||||||
|  |   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds); | ||||||
|  |  | ||||||
|  |   ConjugateGradient<FermionField> CG(1.0e-8,10000); | ||||||
|  |   BlockConjugateGradient<FermionField> BCG(1.0e-8,10000); | ||||||
|  |   MultiRHSConjugateGradient<FermionField> mCG(1.0e-8,10000); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass); | ||||||
|  |   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d); | ||||||
|  |   FermionField src4d(UGrid); random(pRNG,src4d); | ||||||
|  |   FermionField result4d(UGrid); result4d=zero; | ||||||
|  |   CG(HermOp4d,src4d,result4d); | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   result=zero; | ||||||
|  |   CG(HermOp,src,result); | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   result=zero; | ||||||
|  |   mCG(HermOp,src,result); | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |   result=zero; | ||||||
|  |   BCG(HermOp,src,result); | ||||||
|  |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  | } | ||||||
							
								
								
									
										82
									
								
								tests/solver/Test_staggered_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								tests/solver/Test_staggered_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/Test_wilson_cg_unprec.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  | template<class d> | ||||||
|  | struct scal { | ||||||
|  |   d internal; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |   Gamma::Algebra Gmu [] = { | ||||||
|  |     Gamma::Algebra::GammaX, | ||||||
|  |     Gamma::Algebra::GammaY, | ||||||
|  |     Gamma::Algebra::GammaZ, | ||||||
|  |     Gamma::Algebra::GammaT | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   typedef typename ImprovedStaggeredFermionR::FermionField FermionField;  | ||||||
|  |   typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField;  | ||||||
|  |   typename ImprovedStaggeredFermionR::ImplParams params;  | ||||||
|  |  | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   std::vector<int> latt_size   = GridDefaultLatt(); | ||||||
|  |   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
|  |   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |   GridCartesian               Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |   GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|  |   std::vector<int> seeds({1,2,3,4}); | ||||||
|  |   GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds); | ||||||
|  |  | ||||||
|  |   FermionField src(&Grid); random(pRNG,src); | ||||||
|  |   RealD nrm = norm2(src); | ||||||
|  |   FermionField result(&Grid); result=zero; | ||||||
|  |   LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); | ||||||
|  |  | ||||||
|  |   double volume=1; | ||||||
|  |   for(int mu=0;mu<Nd;mu++){ | ||||||
|  |     volume=volume*latt_size[mu]; | ||||||
|  |   }   | ||||||
|  |    | ||||||
|  |   RealD mass=0.1; | ||||||
|  |   ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); | ||||||
|  |  | ||||||
|  |   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds); | ||||||
|  |   CG(HermOp,src,result); | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  | } | ||||||
		Reference in New Issue
	
	Block a user