From b9113ed310f4933b0754e6113906909581bdc623 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 13 Apr 2017 12:02:12 -0400
Subject: [PATCH 01/11] Patches for knl

---
 lib/simd/Grid_avx512.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index dae3c1c7..0c4061fb 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -343,11 +343,12 @@ namespace Optimization {
   struct PrecisionChange {
     static inline __m512i StoH (__m512 a,__m512 b) {
+      __m512i h;
 #ifdef USE_FP16
       __m256i ha = _mm512_cvtps_ph(a,0);
       __m256i hb = _mm512_cvtps_ph(b,0);
-      __m512 h = _mm512_castps256_ps512(ha);
-      h = _mm512_insertf256_ps(h,hb,1);
+      h =(__m512i) _mm512_castps256_ps512(ha);
+      h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m512d)hb,1);
 #else
       assert(0);
 #endif
@@ -365,12 +366,12 @@ namespace Optimization {
       __m256 sa = _mm512_cvtpd_ps(a);
       __m256 sb = _mm512_cvtpd_ps(b);
       __m512 s = _mm512_castps256_ps512(sa);
-      s = _mm512_insertf256_ps(s,sb,1);
+      s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1);
       return s;
     }
     static inline void StoD (__m512 s,__m512d &a,__m512d &b) {
-      a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0));
-      b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1));
+      a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0));
+      b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1));
     }
     static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
       __m512 sa,sb;
@@ -581,7 +582,9 @@ namespace Optimization {
 //////////////////////////////////////////////////////////////////////////////////////
   // Here assign types

-  typedef __m512  SIMD_Ftype; // Single precision type
+
+  typedef __m512i SIMD_Htype; // Half precision type
+  typedef __m512  SIMD_Ftype; // Single precision type
   typedef __m512d SIMD_Dtype; // Double precision type
   typedef __m512i SIMD_Itype; // Integer type

From 951be75292043995589bcb780b1e9702e4ea8c14 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 13 Apr 2017 17:35:11 +0100
Subject: [PATCH 02/11] Half precision conversion working on AVX512 now too

---
 lib/simd/Grid_avx512.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 0c4061fb..9156f113 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -347,8 +347,8 @@ namespace Optimization {
 #ifdef USE_FP16
       __m256i ha = _mm512_cvtps_ph(a,0);
       __m256i hb = _mm512_cvtps_ph(b,0);
-      h =(__m512i) _mm512_castps256_ps512(ha);
-      h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m512d)hb,1);
+      h =(__m512i) _mm512_castps256_ps512((__m256)ha);
+      h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1);
 #else
       assert(0);
 #endif
@@ -356,8 +356,8 @@ namespace Optimization {
   }
   static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
 #ifdef USE_FP16
-    sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0));
-    sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1));
+    sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0));
+    sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1));
 #else
     assert(0);
 #endif

From 3ca41458a3be2f2eb055947ab647c49fb1ae782a Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 14 Apr 2017 14:20:54 +0100
Subject: [PATCH 03/11] Fix to no USE_FP16 case

---
 lib/simd/Grid_avx.h    | 5 +++--
 lib/simd/Grid_avx512.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 73792f84..5c16dd71 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -470,13 +470,14 @@
namespace Optimization { return in; }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { + __m256 h; #ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); - __m256 h = _mm256_castps128_ps256(ha); + h = _mm256_castps128_ps256(ha); h = _mm256_insertf128_ps(h,hb,1); #else assert(0); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 9156f113..ba054665 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -340,7 +340,7 @@ namespace Optimization { }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { __m512i h; From a9c22d5f4347d4882b513ed63e47be63e7118d61 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 14 Apr 2017 14:38:49 +0100 Subject: [PATCH 04/11] Verbose removal --- tests/core/Test_fft_gfix.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index c6b77a13..7938241e 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -148,11 +148,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { Complex psqMax(16.0); Fp = psqMax*one/psq; + /* static int once; if ( once == 0 ) { std::cout << " Fp " << Fp < Date: Sat, 15 Apr 2017 08:54:11 +0100 Subject: [PATCH 05/11] Cleaning up the dense matrix and lanczos sector --- TODO | 62 ++- lib/algorithms/Algorithms.h | 2 +- .../{iterative => densematrix}/DenseMatrix.h | 0 .../{iterative => densematrix}/Francis.h | 0 .../{iterative => densematrix}/Householder.h | 0 .../iterative/ImplicitlyRestartedLanczos.h | 9 +- lib/algorithms/iterative/Matrix.h | 453 ------------------ lib/algorithms/iterative/MatrixUtils.h | 75 --- lib/algorithms/iterative/TODO | 15 - lib/algorithms/iterative/bisec.c | 122 ----- lib/algorithms/iterative/get_eig.c | 1 - lib/util/Init.cc | 4 +- tests/debug/Test_synthetic_lanczos.cc | 13 +- 13 files changed, 61 insertions(+), 695 deletions(-) rename lib/algorithms/{iterative => densematrix}/DenseMatrix.h (100%) rename lib/algorithms/{iterative => densematrix}/Francis.h (100%) rename lib/algorithms/{iterative => densematrix}/Householder.h (100%) delete mode 100644 lib/algorithms/iterative/Matrix.h delete mode 100644 lib/algorithms/iterative/MatrixUtils.h delete mode 100644 lib/algorithms/iterative/TODO delete mode 100644 lib/algorithms/iterative/bisec.c delete mode 100644 lib/algorithms/iterative/get_eig.c diff --git a/TODO b/TODO index df8554cc..91034f20 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,27 @@ TODO: --------------- +Peter's work list: + +-- Merge high precision reduction into develop +-- Physical propagator interface +-- Precision conversion and sort out localConvert +-- slice* linalg routines for multiRHS, BlockCG +-- Profile CG, BlockCG, etc... Flop count/rate +-- Binary I/O speed up & x-strips +-- Half-precision comms +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- GaugeFix into central location +-- Help Julia with NPR code +-- Switch to measurements +-- Multigrid Wilson and DWF, compare to other Multigrid implementations +-- Remove DenseVector, DenseMatrix; Use Eigen instead. +-- quaternions -- Might not need + + +-- Conserved currents + +----- * Forces; the UdSdU term in gauge force term is half of what I think it should be. This is a consequence of taking ONLY the first term in: @@ -21,16 +42,8 @@ TODO: This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two. 
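  (Editor's sketch of the counting, assuming only that S is real and
   depends on the link through both U and Udag:

      dS/dt = Tr( dU/dt dS/dU ) + Tr( dUdag/dt dS/dUdag )

   The two traces are complex conjugates of one another, so the full
   derivative is 2 Re Tr( dU/dt dS/dU ); keeping only the first term is
   what halves the force, as the note above and below describes.)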
This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
-
-Policies:
-
-* Link smearing/boundary conds; Policy class based implementation ; framework more in place
-
 * Support different boundary conditions (finite temp, chem. potential ... )

-* Support different fermion representations?
-  - contained entirely within the integrator presently
-
 - Sign of force term.

 - Reversibility test.
@@ -41,11 +54,6 @@ Policies:

 - Audit oIndex usage for cb behaviour

-- Rectangle gauge actions.
-  Iwasaki,
-  Symanzik,
-  ... etc...
-
 - Prepare multigrid for HMC.
   - Alternate setup schemes.
 - Support for ILDG --- ugly, not done
@@ -55,9 +63,11 @@ Policies:
 - FFTnD ?

 - Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
+
 - Gparity force term; Gparity (R)HMC.
-- Random number state save restore
+
 - Mobius implementation clean up to remove #if 0 stale code sequences
+
 - CG -- profile carefully, kernel fusion, whole CG performance measurements.

 ================================================================
@@ -90,6 +100,7 @@ Insert/Extract
 Not sure of status of this -- reverify. Things are working nicely now though.

 * Make the Tensor types and Complex etc... play more nicely.
+
   - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar > >
    QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
    want to introduce a syntax that does not require this.
@@ -112,6 +123,8 @@ Not sure of status of this -- reverify. Things are working nicely now though.

 RECENT
 ---------------
+  - Support different fermion representations? -- DONE
+    - contained entirely within the integrator presently
  - Clean up HMC                                                          -- DONE
  - LorentzScalar gets Gauge link type (cleaner).                         -- DONE
  - Simplified the integrators a bit.                                     -- DONE
@@ -123,6 +136,26 @@ RECENT
  - Parallel io improvements                                              -- DONE
  - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
+
+DONE:
+- MultiArray                         -- MultiRHS done
+- ConjugateGradientMultiShift        -- DONE
+- MCR                                -- DONE
+- Remez                              -- Mike or Boost? -- DONE
+- Proto (ET)                         -- DONE
+- uBlas                              -- DONE ; Eigen
+- Potentially Useful Boost libraries -- DONE ; Eigen
+- Aligned allocator; memory pool     -- DONE
+- Multiprecision                     -- DONE
+- Serialization                      -- DONE
+- Regex                              -- Not needed
+- Tokenize                           -- Why?
+
+- Random number state save restore   -- DONE
+- Rectangle gauge actions.           -- DONE
+    Iwasaki,
+    Symanzik,
+    ... etc...

 Done: Cayley, Partial , ContFrac force terms.  DONE
@@ -207,6 +240,7 @@ Done
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
 ======================================================================================================
+* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE
 * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE
   user pass these? Is this a QCD specific?
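(Editor's sketch, not part of any patch: the FP16 round trip that patches
01-03 above implement inside Grid's PrecisionChange, and that the
"Half-precision comms" TODO item above builds on, written against plain
AVX-512F intrinsics. The integer-domain insert/extract used here is an
assumption made for clarity; the patches themselves route through the
float/double domains with casts.)

#include <immintrin.h>

// Pack two vectors of 16 floats into one vector of 32 half-precision
// values; the 0 argument selects round-to-nearest, as in the patches.
static inline __m512i f32_pair_to_f16(__m512 a, __m512 b) {
  __m256i ha = _mm512_cvtps_ph(a, 0);
  __m256i hb = _mm512_cvtps_ph(b, 0);
  __m512i h  = _mm512_castsi256_si512(ha);  // low 256 bits  <- ha
  return _mm512_inserti64x4(h, hb, 1);      // high 256 bits <- hb
}

// Inverse: unpack 32 halves back into two vectors of 16 floats.
static inline void f16_to_f32_pair(__m512i h, __m512 &a, __m512 &b) {
  a = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h, 0));
  b = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h, 1));
}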
diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h index 1b82f0ce..5123c7a1 100644 --- a/lib/algorithms/Algorithms.h +++ b/lib/algorithms/Algorithms.h @@ -46,7 +46,7 @@ Author: Peter Boyle #include // Lanczos support -#include +//#include #include #include #include diff --git a/lib/algorithms/iterative/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h similarity index 100% rename from lib/algorithms/iterative/DenseMatrix.h rename to lib/algorithms/densematrix/DenseMatrix.h diff --git a/lib/algorithms/iterative/Francis.h b/lib/algorithms/densematrix/Francis.h similarity index 100% rename from lib/algorithms/iterative/Francis.h rename to lib/algorithms/densematrix/Francis.h diff --git a/lib/algorithms/iterative/Householder.h b/lib/algorithms/densematrix/Householder.h similarity index 100% rename from lib/algorithms/iterative/Householder.h rename to lib/algorithms/densematrix/Householder.h diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 233eb8f5..3aa54360 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -30,6 +30,7 @@ Author: paboyle #define GRID_IRL_H #include //memset + #ifdef USE_LAPACK void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, double *vl, double *vu, int *il, int *iu, double *abstol, @@ -37,8 +38,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, double *work, int *lwork, int *iwork, int *liwork, int *info); #endif -#include "DenseMatrix.h" -#include "EigenSort.h" + +#include +#include namespace Grid { @@ -1088,8 +1090,6 @@ static void Lock(DenseMatrix &H, // Hess mtx int dfg, bool herm) { - - //ForceTridiagonal(H); int M = H.dim; @@ -1121,7 +1121,6 @@ static void Lock(DenseMatrix &H, // Hess mtx AH = Hermitian(QQ)*AH; AH = AH*QQ; - for(int i=con;i - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef MATRIX_H -#define MATRIX_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/** Sign function **/ -template T sign(T p){return ( p/abs(p) );} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////// Hijack STL containers for our wicked means ///////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -template using Vector = Vector; -template using Matrix = Vector >; - -template void Resize(Vector & vec, int N) { vec.resize(N); } - -template void Resize(Matrix & mat, int N, int M) { - mat.resize(N); - for(int i=0;i void Size(Vector & vec, int &N) -{ - N= vec.size(); -} -template void Size(Matrix & mat, int &N,int &M) -{ - N= mat.size(); - M= mat[0].size(); -} -template void SizeSquare(Matrix & mat, int &N) -{ - int M; Size(mat,N,M); - assert(N==M); -} -template void SizeSame(Matrix & mat1,Matrix &mat2, int &N1,int &M1) -{ - int N2,M2; - Size(mat1,N1,M1); - Size(mat2,N2,M2); - assert(N1==N2); - assert(M1==M2); -} - -//***************************************** -//* (Complex) Vector operations * -//***************************************** - -/**Conj of a Vector **/ -template Vector conj(Vector p){ - Vector q(p.size()); - for(int i=0;i T norm(Vector p){ - T sum = 0; - for(int i=0;i T norm2(Vector p){ - T sum = 0; - for(int i=0;i T trace(Vector p){ - T sum = 0; - for(int i=0;i void Fill(Vector &p, T c){ - for(int i=0;i void normalize(Vector &p){ - T m = norm(p); - if( abs(m) > 0.0) for(int i=0;i Vector times(Vector p, U s){ - for(int i=0;i Vector times(U s, Vector p){ - for(int i=0;i T inner(Vector a, Vector b){ - T m = 0.; - for(int i=0;i Vector add(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i Vector sub(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i void Fill(Matrix & mat, T&val) { - int N,M; - Size(mat,N,M); - for(int i=0;i Transpose(Matrix & mat){ - int N,M; - Size(mat,N,M); - Matrix C; Resize(C,M,N); - for(int i=0;i void Unity(Matrix &mat){ - int N; SizeSquare(mat,N); - for(int i=0;i -void PlusUnit(Matrix & A,T c){ - int dim; SizeSquare(A,dim); - for(int i=0;i HermitianConj(Matrix &mat){ - - int dim; SizeSquare(mat,dim); - - Matrix C; Resize(C,dim,dim); - - for(int i=0;i diag(Matrix &A) -{ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - - for(int i=0;i operator *(Vector &B,Matrix &A) -{ - int K,M,N; - Size(B,K); - Size(A,M,N); - assert(K==M); - - Vector C; Resize(C,N); - - for(int j=0;j inv_diag(Matrix & A){ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - for(int i=0;i operator + (Matrix &A,Matrix &B) -{ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator- (Matrix & A,Matrix &B){ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix & A,T c){ - int N,M; Size(A,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix &A,Matrix &B){ - int K,L,N,M; - Size(A,K,L); - Size(B,N,M); assert(L==N); - Matrix C; Resize(C,K,M); - - for(int i=0;i operator* (Matrix &A,Vector &B){ - int M,N,K; - Size(A,N,M); - Size(B,K); assert(K==M); - Vector C; Resize(C,N); - for(int i=0;i T LargestDiag(Matrix &A) -{ - int dim ; SizeSquare(A,dim); - - T ld = abs(A[0][0]); - for(int i=1;i 
abs(ld) ){ld = cf;} - } - return ld; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_subdiag(Matrix &A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l-1]=(U)0.0; - return l; - } - } - return 0; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_symm_subdiag(Matrix & A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l - 1] = (U)0.0; - A[l - 1][l] = (U)0.0; - return l; - } - } - return 0; -} -/**Assign a submatrix to a larger one**/ -template -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i -Matrix GetSubMtx(Matrix &A,int row_st, int row_end, int col_st, int col_end) -{ - Matrix H; Resize(row_end - row_st,col_end-col_st); - - for(int i = row_st; i -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i T proj(Matrix A, Vector B){ - int dim; SizeSquare(A,dim); - int dimB; Size(B,dimB); - assert(dimB==dim); - T C = 0; - for(int i=0;i q Q -template void times(Vector &q, Matrix &Q) -{ - int M; SizeSquare(Q,M); - int N; Size(q,N); - assert(M==N); - - times(q,Q,N); -} - -/// q -> q Q -template void times(multi1d &q, Matrix &Q, int N) -{ - GridBase *grid = q[0]._grid; - int M; SizeSquare(Q,M); - int K; Size(q,K); - assert(N S(N,grid ); - for(int j=0;j - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_MATRIX_UTILS_H -#define GRID_MATRIX_UTILS_H - -namespace Grid { - - namespace MatrixUtils { - - template inline void Size(Matrix& A,int &N,int &M){ - N=A.size(); assert(N>0); - M=A[0].size(); - for(int i=0;i inline void SizeSquare(Matrix& A,int &N) - { - int M; - Size(A,N,M); - assert(N==M); - } - - template inline void Fill(Matrix& A,T & val) - { - int N,M; - Size(A,N,M); - for(int i=0;i inline void Diagonal(Matrix& A,T & val) - { - int N; - SizeSquare(A,N); - for(int i=0;i inline void Identity(Matrix& A) - { - Fill(A,0.0); - Diagonal(A,1.0); - } - - }; -} -#endif diff --git a/lib/algorithms/iterative/TODO b/lib/algorithms/iterative/TODO deleted file mode 100644 index ca3bca3b..00000000 --- a/lib/algorithms/iterative/TODO +++ /dev/null @@ -1,15 +0,0 @@ -- ConjugateGradientMultiShift -- MCR - -- Potentially Useful Boost libraries - -- MultiArray -- Aligned allocator; memory pool -- Remez -- Mike or Boost? 
-- Multiprecision -- quaternians -- Tokenize -- Serialization -- Regex -- Proto (ET) -- uBlas diff --git a/lib/algorithms/iterative/bisec.c b/lib/algorithms/iterative/bisec.c deleted file mode 100644 index a5117aee..00000000 --- a/lib/algorithms/iterative/bisec.c +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include - -struct Bisection { - -static void get_eig2(int row_num,std::vector &ALPHA,std::vector &BETA, std::vector & eig) -{ - int i,j; - std::vector evec1(row_num+3); - std::vector evec2(row_num+3); - RealD eps2; - ALPHA[1]=0.; - BETHA[1]=0.; - for(i=0;imag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - end--; - for(i=end-1;i>=begin;i--){ - if(mag(evec2[i])>mag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - begin++; - } - - for(i=0;i &c, - std::vector &b, - int n, - int m1, - int m2, - RealD eps1, - RealD relfeh, - std::vector &x, - RealD &eps2) -{ - std::vector wu(n+2); - - RealD h,q,x1,xu,x0,xmin,xmax; - int i,a,k; - - b[1]=0.0; - xmin=c[n]-fabs(b[n]); - xmax=c[n]+fabs(b[n]); - for(i=1;ixmax) xmax= c[i]+h; - if(c[i]-h0.0 ? xmax : -xmin); - if(eps1<=0.0) eps1=eps2; - eps2=0.5*eps1+7.0*(eps2); - x0=xmax; - for(i=m1;i<=m2;i++){ - x[i]=xmax; - wu[i]=xmin; - } - - for(k=m2;k>=m1;k--){ - xu=xmin; - i=k; - do{ - if(xu=m1); - if(x0>x[k]) x0=x[k]; - while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ - x1=(xu+x0)/2; - - a=0; - q=1.0; - for(i=1;i<=n;i++){ - q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); - if(q<0) a++; - } - // printf("x1=%e a=%d\n",x1,a); - if(ax1) x[a]=x1; - } - }else x0=x1; - } - x[k]=(x0+xu)/2; - } -} -} diff --git a/lib/algorithms/iterative/get_eig.c b/lib/algorithms/iterative/get_eig.c deleted file mode 100644 index d3f5a12f..00000000 --- a/lib/algorithms/iterative/get_eig.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 990a4e0f..d0685480 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -311,8 +311,8 @@ void Grid_init(int *argc,char ***argv) std::cout< Cheby(alpha,beta,mu,order); @@ -131,10 +131,9 @@ int main (int argc, char ** argv) const int Nit= 10000; int Nconv; - RealD eresid = 1.0e-8; + RealD eresid = 1.0e-6; ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); @@ -145,9 +144,9 @@ int main (int argc, char ** argv) } { - // std::vector eval(Nm); - // std::vector evec(Nm,grid); - // ChebyIRL.calc(eval,evec,src, Nconv); + std::vector eval(Nm); + std::vector evec(Nm,grid); + ChebyIRL.calc(eval,evec,src, Nconv); } Grid_finalize(); From 441a52ee5d9917ef0c7d2de179adcc9dab71804d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 10:57:21 +0100 Subject: [PATCH 06/11] First cut at higher precision reduction --- lib/lattice/Lattice_reduction.h | 197 ++++++++++++++++---------------- lib/simd/Grid_vector_types.h | 1 - lib/tensors/Tensor_class.h | 3 + lib/tensors/Tensor_inner.h | 142 ++++++++++++++++------- lib/tensors/Tensor_traits.h | 10 ++ 5 files changed, 210 insertions(+), 143 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 45a88a64..bb7808b8 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -38,110 +38,108 @@ namespace Grid { //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - 
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-    ComplexD nrm = innerProduct(arg,arg);
-    return std::real(nrm);
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
+  ComplexD nrm = innerProduct(arg,arg);
+  return std::real(nrm);
+}
+
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  scalar_type  nrm;
+
+  GridBase *grid = left._grid;
+
+  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    sumarray[i]=zero;
 }
-
-  template<class vobj>
-  inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
-  {
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
-    scalar_type  nrm;
-
-    GridBase *grid = left._grid;
-
-    std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
-    for(int i=0;i<grid->SumArraySize();i++){
-      sumarray[i]=zero;
-    }
-
-    parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-      int nwork, mywork, myoff;
-      GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
-
-      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
-      for(int ss=myoff;ss<mywork+myoff; ss++){
-        vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
-      }
-      sumarray[thr]=TensorRemove(vnrm) ;
-    }
-
-    vector_type vvnrm; vvnrm=zero;  // sum across threads
-    for(int i=0;i<grid->SumArraySize();i++){
-      vvnrm = vvnrm+sumarray[i];
-    }
-    nrm = Reduce(vvnrm);// sum across simd
-    right._grid->GlobalSum(nrm);
-    return nrm;
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int nwork, mywork, myoff;
+    GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
+
+    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
+    }
+    sumarray[thr]=TensorRemove(vnrm) ;
+  }
+
+  vector_type vvnrm; vvnrm=zero;  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vvnrm = vvnrm+sumarray[i];
+  }
+  nrm = Reduce(vvnrm);// sum across simd
+  right._grid->GlobalSum(nrm);
+  return nrm;
+}
+
+template<class Op,class T1>
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+{
+  return sum(closure(expr));
+}
-  template<class Op,class T1>
-  inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
-    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
-  {
-    return sum(closure(expr));
-  }
-
-  template<class Op,class T1,class T2>
-  inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+template<class Op,class T1,class T2>
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
       ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
-  {
-    return sum(closure(expr));
+{
+  return sum(closure(expr));
+}
+
+
+template<class Op,class T1,class T2,class T3>
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                      eval(0,std::get<1>(expr.second)),
+                                      eval(0,std::get<2>(expr.second))
+                                      ))::scalar_object
+{
+  return sum(closure(expr));
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  GridBase *grid=arg._grid;
+  int  Nsimd = grid->Nsimd();
+
+  std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    sumarray[i]=zero;
+  }
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int nwork, mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+
+    vobj vvsum=zero;
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vvsum = vvsum + arg._odata[ss];
+    }
-
-  template<class Op,class T1,class T2,class T3>
-  inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-                                        eval(0,std::get<1>(expr.second)),
-                                        eval(0,std::get<2>(expr.second))
-                                        ))::scalar_object
-  {
-    return sum(closure(expr));
-  }
-
-  template<class vobj>
-  inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){
-
-    GridBase *grid=arg._grid;
-    int Nsimd = grid->Nsimd();
-
std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - - vobj vvsum=zero; - for(int ss=myoff;ssSumArraySize();i++){ - vsum = vsum+sumarray[i]; - } - - typedef typename vobj::scalar_object sobj; - sobj ssum=zero; - - std::vector buf(Nsimd); - extract(vsum,buf); - - for(int i=0;iGlobalSum(ssum); - - return ssum; - } - - + sumarray[thr]=vvsum; + } + + vobj vsum=zero; // sum across threads + for(int i=0;iSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + + std::vector buf(Nsimd); + extract(vsum,buf); + + for(int i=0;iGlobalSum(ssum); + + return ssum; +} template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { @@ -214,7 +212,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< result[t]=gsum; } - } diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ccedf99c..248a625c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -694,7 +694,6 @@ inline Grid_simd innerProduct(const Grid_simd &l, const Grid_simd &r) { return conjugate(l) * r; } - template inline Grid_simd outerProduct(const Grid_simd &l, const Grid_simd &r) { diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index e0b69eb0..9b806d75 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -56,6 +56,7 @@ class iScalar { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; @@ -193,6 +194,7 @@ class iVector { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; typedef iScalar tensor_reduced; @@ -305,6 +307,7 @@ class iMatrix { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 22284be4..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -29,51 +29,109 @@ Author: Peter Boyle #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - typedef typename sobj::scalar_type scalar; - decltype(innerProduct(arg,arg)) nrm; - nrm = innerProduct(arg,arg); - RealD ret = real(nrm); - return ret; - } + 
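// Editor's note, not part of the patch: the innerProductD overloads added
// below promote precision at the innermost SIMD leaf, so the tensor
// recursion accumulates in double even for single-precision fields. The
// leaf overload is not shown in this patch series; schematically (an
// assumption about its exact shape) it is something like
//   inline vComplexD innerProductD(const vComplexF &l, const vComplexF &r);
// i.e. conjugate(l)*r widened to double lanes, defined at the Grid_simd level.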
/////////////////////////////////////////////////////////////////////////////////////// + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar + /////////////////////////////////////////////////////////////////////////////////////// + template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; + } + ////////////////////////////////////// + // If single promote to double and sum 2x + ////////////////////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline + auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; + return ret; + } + template inline + auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; + } + ////////////////////// + // Keep same precison + ////////////////////// + template inline + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; + } } #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 777b398d..4dcfd9b1 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -53,6 +53,7 @@ namespace Grid { public: typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; typedef typename T::tensor_reduced tensor_reduced; typedef typename T::scalar_object scalar_object; typedef typename T::Complexified Complexified; @@ -67,6 +68,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef RealF vector_type; + typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; typedef ComplexF Complexified; @@ -77,6 +79,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef RealD vector_type; + typedef RealD 
vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; typedef ComplexD Complexified; @@ -87,6 +90,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; + typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; typedef ComplexF Complexified; @@ -97,6 +101,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; + typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; typedef ComplexD Complexified; @@ -118,6 +123,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef vRealF vector_type; + typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; typedef vComplexF Complexified; @@ -128,6 +134,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef vRealD vector_type; + typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; typedef vComplexD Complexified; @@ -138,6 +145,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; + typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; typedef vComplexF Complexified; @@ -148,6 +156,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; + typedef vComplexD vector_typeD; typedef vComplexD tensor_reduced; typedef ComplexD scalar_object; typedef vComplexD Complexified; @@ -158,6 +167,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef vInteger vector_type; + typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; typedef void Complexified; From bf516c3b8127667129daa993a61d1f2755039407 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 12:27:28 +0100 Subject: [PATCH 07/11] higher precision reduction variables in norm and inner product --- lib/lattice/Lattice_reduction.h | 3 ++- tests/solver/Test_dwf_cg_prec.cc | 2 +- tests/solver/Test_dwf_cg_unprec.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index bb7808b8..e12bf0dd 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -42,7 +42,7 @@ template inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } - +// Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { @@ -102,6 +102,7 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } +// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index 30436e36..0ede352e 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; SchurDiagMooeeOperator HermOpEO(Ddwf); - ConjugateGradient CG(1.0e-8, 10000, 0);// switch off the assert + ConjugateGradient CG(1.0e-5, 10000, 0);// switch off the assert CGTimer.Start(); CG(HermOpEO, src_o, result_o); diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 131f6ce1..38f632ef 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); 
MdagMLinearOperator HermOp(Ddwf); - ConjugateGradient CG(1.0e-8,10000); + ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); Grid_finalize(); From 7ede6961269516638aa21560f9511ec5140fbcb2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 16 Apr 2017 23:40:00 +0100 Subject: [PATCH 08/11] Non compile of tests fixed --- TODO | 17 +- .../iterative/BlockConjugateGradient.h | 201 ------------------ lib/lattice/Lattice_reduction.h | 159 ++++++++++++++ lib/simd/Grid_vector_types.h | 2 +- lib/tensors/Tensor_traits.h | 1 + 5 files changed, 170 insertions(+), 210 deletions(-) diff --git a/TODO b/TODO index 91034f20..27579ad3 100644 --- a/TODO +++ b/TODO @@ -3,19 +3,20 @@ TODO: Peter's work list: --- Merge high precision reduction into develop +-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +-- Merge high precision reduction into develop <-- done +-- Precision conversion and sort out localConvert <-- -- Physical propagator interface --- Precision conversion and sort out localConvert --- slice* linalg routines for multiRHS, BlockCG + +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG <-- started + -- Profile CG, BlockCG, etc... Flop count/rate -- Binary I/O speed up & x-strips --- Half-precision comms --- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- Half-precision comms <-- started -- GaugeFix into central location --- Help Julia with NPR code --- Switch to measurements +-- FFTfix in sensible place -- Multigrid Wilson and DWF, compare to other Multigrid implementations --- Remove DenseVector, DenseMatrix; Use Eigen instead. -- quaternions -- Might not need diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 0f4a3a80..1db89512 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -30,210 +30,9 @@ directory #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #define GRID_BLOCK_CONJUGATE_GRADIENT_H -#include namespace Grid { -GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) -{ - int NN = BlockSolverGrid->_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); -} - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... 
into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i -static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss -static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = rhs._grid->GlobalDimensions()[Orthog]; - std::vector ip(Nblock); - sn.resize(Nblock); - - sliceInnerProductVector(ip,rhs,rhs,Orthog); - for(int ss=0;ss -static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); - std::vector sip(Nblock); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - Lattice tmp = rhs; - - for(int s1=0;s1 #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H +#include + namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -215,6 +217,163 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) + { + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... 
into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template + static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow + // If we based this on Cshift it would work for spread out + // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + for(int i=0;i + static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) + { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i + static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i + static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss + static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss Date: Mon, 17 Apr 2017 10:50:19 +0100 Subject: [PATCH 09/11] MultiRHS working, starting to optimise. 
Block doesn't and I thought it already was; puzzled. --- .../iterative/BlockConjugateGradient.h | 2 + lib/lattice/Lattice_reduction.h | 209 ++++++++++++++---- 2 files changed, 170 insertions(+), 41 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 1db89512..42a9af56 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -256,8 +256,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) Linop.HermOp(P, AP); // Alpha + // sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); sliceInnerProductVector(v_pAp,P,AP,Orthog); for(int b=0;b &left,const Lattice &righ GridBase *grid = left._grid; std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; @@ -152,7 +149,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< // FIXME // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -164,8 +160,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first - std::vector lsSum(ld,zero); // sum across these down to scalars - std::vector extracted(Nsimd); // splitting the SIMD + std::vector lsSum(ld,zero); // sum across these down to scalars + std::vector extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r inline void sliceSum(const Lattice &Data,std::vector< std::vector coor(Nd); // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); int r = coor[orthogdim]; lvSum[r]=lvSum[r]+Data._odata[ss]; - } + } // Sum across simd lanes in the plane, breaking out orthog dir. 
std::vector icoor(Nd); @@ -217,6 +212,171 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +template + static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + GridBase *grid = lhs._grid; + assert(grid!=NULL); + conformable(grid,rhs._grid); + + // FIXME + // std::cout<SumArraySize()<<" threads "<_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + std::vector > lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector > extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node for IO to file + for(int r=0;r coor(Nd); + vector_type vv; + PARALLEL_FOR_LOOP_INTERN + for(int ss=0;ssoSites();ss++){ + Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); + int r = coor[orthogdim]; + vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); + PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + lvSum[r]=lvSum[r]+vv; + } + } + } + + // Sum across simd lanes in the plane, breaking out orthog dir. + std::vector icoor(Nd); + for(int rt=0;rt temp; temp._internal = lvSum[rt]; + extract(temp,extracted); + + for(int idx=0;idxiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; + + } + } + + // sum over nodes. 
+ scalar_type gsum; + for(int t=0;t_processor_coor[orthogdim] ) { + gsum=lsSum[lt]; + } else { + gsum=scalar_type(0.0); + } + + grid->GlobalSum(gsum); + + result[t]=gsum; + } +} +#if 0 +template +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Look at sliceSum implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + GridBase * grid = lhs._grid; + + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int Nblock = grid->GlobalDimensions()[Orthog]; + int Nrblock = grid->_rdimensions[Orthog]; + int Nthr = grid->SumArraySize(); + + std::vector > sumarray(Nrblock*Nthr); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + + int nwork, mywork, myoff; + + for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); + int off = rb * grid->_slice_ + vector_type vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ss sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_ndimension; @@ -322,41 +482,8 @@ template mat(i,j) = innerProduct(Lslice,Rslice); } } -#undef FORCE_DIAG -#ifdef FORCE_DIAG - for(int i=0;i - static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { From 8e161152e457954ba2493b9d807516ae0f2eb030 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 10:51:55 +0100 Subject: [PATCH 10/11] MultiRHS solver improvements with slice operations moved into lattice and sped up. Block solver requires a lot of performance work. 
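Editor's sketch (not part of this commit): the block algebra one iteration
of the block solver performs, written with Eigen alone so that the slice
calls have a concrete analogue. The N x Nblock column layout and the helper
name are illustrative; the solver realises each product via
sliceInnerProductMatrix / sliceMaddMatrix on lattice fields.

    #include <Eigen/Dense>
    // One O'Leary-style block-CG step: alpha solves the small
    // Nblock x Nblock system (P^dag A P) alpha = (R^dag R), then the
    // solution, residual and search directions are updated column-wise.
    void block_cg_step(Eigen::MatrixXcd &X, Eigen::MatrixXcd &R,
                       Eigen::MatrixXcd &P, const Eigen::MatrixXcd &AP)
    {
      Eigen::MatrixXcd m_rr   = R.adjoint()*R;   // cf. sliceInnerProductMatrix(m_rr ,R,R ,Orthog)
      Eigen::MatrixXcd m_pAp  = P.adjoint()*AP;  // cf. sliceInnerProductMatrix(m_pAp,P,AP,Orthog)
      Eigen::MatrixXcd alpha  = m_pAp.inverse()*m_rr;
      X = X + P*alpha;                           // cf. sliceMaddMatrix updates
      R = R - AP*alpha;
      Eigen::MatrixXcd m_rr_n = R.adjoint()*R;
      Eigen::MatrixXcd beta   = m_rr.inverse()*m_rr_n;
      P = R + P*beta;
    }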
--- .../iterative/BlockConjugateGradient.h | 85 +++- lib/algorithms/iterative/ConjugateGradient.h | 39 +- lib/lattice/Lattice_reduction.h | 457 +++++++++--------- lib/log/Log.h | 4 +- lib/simd/Grid_vector_types.h | 4 - lib/tensors/Tensor_class.h | 28 +- lib/tensors/Tensor_traits.h | 3 +- .../solver/Test_staggered_block_cg_unprec.cc | 30 +- tests/solver/Test_staggered_cg_unprec.cc | 1 - 9 files changed, 366 insertions(+), 285 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 42a9af56..d90194ae 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -60,8 +60,8 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); - GridStopWatch LinalgTimer; - GridStopWatch MatrixTimer; - GridStopWatch SolverTimer; - Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -116,33 +112,49 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) P = R; sliceInnerProductMatrix(m_rr,R,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + SolverTimer.Start(); + int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" Block solver has converged in " - < &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) P = R; sliceNorm(v_rr,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch sliceNormTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" MultiRHS solver has converged in " - < { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << 
"ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction { RealD resnorm = sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage - << "ConjugateGradient: Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) - << " true residual " << true_residual << " target " - << Tolerance << std::endl; - std::cout << GridLogMessage << "Time elapsed: Iterations " - << SolverTimer.Elapsed() << " Matrix " - << MatrixTimer.Elapsed() << " Linalg " - << LinalgTimer.Elapsed(); - std::cout << std::endl; + std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } + // Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) @@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } -// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { @@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice &arg) return ssum; } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... +////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -163,18 +171,27 @@ template inline void sliceSum(const Lattice &Data,std::vector< std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD - result.resize(fd); // And then global sum to return the same vector to every node for IO to file + result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r coor(Nd); + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - lvSum[r]=lvSum[r]+Data._odata[ss]; + // Parallel over orthog direction + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n inline void sliceSum(const Lattice &Data,std::vector< } } -template - static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename 
vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { @@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector & result, const Latti assert(grid!=NULL); conformable(grid,rhs._grid); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=zero; } - // sum over reduced dimension planes, breaking out orthog dir - PARALLEL_REGION { - std::vector coor(Nd); - vector_type vv; - PARALLEL_FOR_LOOP_INTERN - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); - PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n & result, const Latti std::vector icoor(Nd); for(int rt=0;rt temp; temp._internal = lvSum[rt]; + iScalar temp; + temp._internal = lvSum[rt]; extract(temp,extracted); for(int idx=0;idx & result, const Latti result[t]=gsum; } } -#if 0 template -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Look at sliceSum implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - GridBase * grid = lhs._grid; - - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int Nblock = grid->GlobalDimensions()[Orthog]; - int Nrblock = grid->_rdimensions[Orthog]; - int Nthr = grid->SumArraySize(); - - std::vector > sumarray(Nrblock*Nthr); - - parallel_for(int thr=0;thrSumArraySize();thr++){ - - int nwork, mywork, myoff; - - for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); - int off = rb * grid->_slice_ - vector_type vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ss sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); - } - 
////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template - static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) - { - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i - static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) - { - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i - static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i - static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -499,9 +324,207 @@ template for(int ss=0;ss +static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int orthogdim,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced tensor_reduced; + + GridBase *grid = X._grid; + + int Nsimd =grid->Nsimd(); + int Nblock =grid->GlobalDimensions()[orthogdim]; + + int fd =grid->_fdimensions[orthogdim]; + int ld =grid->_ldimensions[orthogdim]; + int rd =grid->_rdimensions[orthogdim]; + + int e1 =grid->_slice_nblock[orthogdim]; + int e2 =grid->_slice_block [orthogdim]; + int stride =grid->_slice_stride[orthogdim]; + + std::vector icoor; + + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + vector_type av; + + for(int l=0;liCoorFromIindex(icoor,l); + int ldx =r+icoor[orthogdim]*rd; + scalar_type *as =(scalar_type *)&av; + as[l] = scalar_type(a[ldx])*scale; + } + + tensor_reduced at; at=av; + + parallel_for_nest2(int n=0;n +static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_rdimensions[Orthog]. 
+////////////////////////////////////////////////////////////////////////////////////////// + +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } + + +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Real by "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Complex by "< =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Real by "< =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Complex by "<::vector_type vector_type; typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; + typedef iScalar tensor_reduced; typedef iScalar scalar_object; - // substitutes a real or complex version with same tensor structure typedef iScalar::Complexified> Complexified; typedef iScalar::Realified> Realified; @@ -78,8 +77,12 @@ class iScalar { iScalar & operator= (const iScalar ©me) = default; iScalar & operator= (iScalar &©me) = default; */ - iScalar(scalar_type s) - : _internal(s){}; // recurse down and hit the constructor for vector_type + + // template + // iScalar(EnableIf, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type + + iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type + iScalar(const Zero &z) { *this = zero; }; iScalar &operator=(const Zero &hero) { @@ -135,33 +138,28 @@ class iScalar { strong_inline const vtype &operator()(void) const { return _internal; } // Type casts meta programmed, must be pure scalar to match TensorRemove - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexF() const { 
return (TensorRemove(_internal)); }; - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexD() const { return (TensorRemove(_internal)); }; // template = 0,IfNotSimd = // 0> operator RealD () const { return(real(TensorRemove(_internal))); } - template = 0, - IfNotSimd = 0> + template = 0,IfNotSimd = 0> operator RealD() const { return TensorRemove(_internal); } - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator Integer() const { return Integer(TensorRemove(_internal)); } // convert from a something to a scalar via constructor of something arg - template ::value, T>::type - * = nullptr> - strong_inline iScalar operator=(T arg) { + template ::value, T>::type * = nullptr> + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index e630c217..80009391 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -252,7 +252,8 @@ namespace Grid { template class isSIMDvectorized{ template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); template static double test(...); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index e9142f8c..44e8fb52 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=8; + const int Ls=4; Grid_init(&argc,&argv); @@ -76,24 +76,44 @@ int main (int argc, char ** argv) RealD mass=0.01; ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); - MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); MultiRHSConjugateGradient mCG(1.0e-8,10000); - std::cout << GridLogMessage << " Calling CG "< HermOp4d(Ds4d); + FermionField src4d(UGrid); random(pRNG,src4d); + FermionField result4d(UGrid); result4d=zero; + CG(HermOp4d,src4d,result4d); + std::cout << GridLogMessage << "************************************************************************ "< HermOp(Ds); - ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); Grid_finalize(); From a839d5bc55d900f74289ffba21b7f0bcbe8bcfa2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 11:22:17 +0100 Subject: [PATCH 11/11] Updated todo list --- TODO | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/TODO b/TODO index 27579ad3..a2b7bf03 100644 --- a/TODO +++ b/TODO @@ -2,25 +2,24 @@ TODO: --------------- Peter's work list: +1)- Half-precision comms <-- started -- SIMD is prepared +2)- Precision conversion and sort out localConvert <-- --- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started --- Merge high precision reduction into develop <-- done --- Precision conversion and sort out localConvert <-- +3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +4)- Binary I/O speed up & x-strips + +-- Profile CG, BlockCG, etc... 
Flop count/rate -- PARTIAL, time but no flop/s yet -- Physical propagator interface - --- multiRHS DWF; benchmark on Cori/BNL for comms elimination - -- slice* linalg routines for multiRHS, BlockCG <-- started - --- Profile CG, BlockCG, etc... Flop count/rate --- Binary I/O speed up & x-strips --- Half-precision comms <-- started --- GaugeFix into central location --- FFTfix in sensible place --- Multigrid Wilson and DWF, compare to other Multigrid implementations --- quaternions -- Might not need - - -- Conserved currents +-- GaugeFix into central location + +-- Multigrid Wilson and DWF, compare to other Multigrid implementations +-- HDCR resume + +Recent DONE +-- Merge high precision reduction into develop +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG ----- * Forces; the UdSdU term in gauge force term is half of what I think it should