diff --git a/.travis.yml b/.travis.yml index bc6dd0ef..c62aab12 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,9 +7,10 @@ cache: matrix: include: - os: osx - osx_image: xcode7.2 + osx_image: xcode8.3 compiler: clang - compiler: gcc + sudo: required addons: apt: sources: @@ -24,6 +25,7 @@ matrix: - binutils-dev env: VERSION=-4.9 - compiler: gcc + sudo: required addons: apt: sources: @@ -73,8 +75,6 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: - export CC=$CC$VERSION @@ -92,15 +92,14 @@ script: - cd build - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - - make -j4 + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi diff --git a/TODO b/TODO index df8554cc..672879cd 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,26 @@ TODO: --------------- +Peter's work list: +2)- Precision conversion and sort out localConvert <-- +3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +4)- Binary I/O speed up & x-strips +-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet +-- Physical propagator interface +-- Conserved currents +-- GaugeFix into central location +-- Multigrid Wilson and DWF, compare to other Multigrid implementations +-- HDCR resume + +Recent DONE +-- Cut down the exterior overhead <-- DONE +-- Interior legs from SHM comms <-- DONE +-- Half-precision comms <-- DONE +-- Merge high precision reduction into develop +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG + +----- * Forces; the UdSdU term in gauge force term is half of what I think it should be. This is a consequence of taking ONLY the first term in: @@ -21,16 +41,8 @@ TODO: This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two. This 2x is applied by hand in the fermion routines and in the Test_rect_force routine. - -Policies: - -* Link smearing/boundary conds; Policy class based implementation ; framework more in place - * Support different boundary conditions (finite temp, chem. potential ... ) -* Support different fermion representations? - - contained entirely within the integrator presently - - Sign of force term. - Reversibility test. 
@@ -41,11 +53,6 @@ Policies: - Audit oIndex usage for cb behaviour -- Rectangle gauge actions. - Iwasaki, - Symanzik, - ... etc... - - Prepare multigrid for HMC. - Alternate setup schemes. - Support for ILDG --- ugly, not done @@ -55,9 +62,11 @@ Policies: - FFTnD ? - Gparity; hand opt use template specialisation elegance to enable the optimised paths ? + - Gparity force term; Gparity (R)HMC. -- Random number state save restore + - Mobius implementation clean up to remove #if 0 stale code sequences + - CG -- profile carefully, kernel fusion, whole CG performance measurements. ================================================================ @@ -90,6 +99,7 @@ Insert/Extract Not sure of status of this -- reverify. Things are working nicely now though. * Make the Tensor types and Complex etc... play more nicely. + - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar > > QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken by TensorRemove, but I want to introduce a syntax that does not require this. @@ -112,6 +122,8 @@ Not sure of status of this -- reverify. Things are working nicely now though. RECENT --------------- + - Support different fermion representations? -- DONE + - contained entirely within the integrator presently - Clean up HMC -- DONE - LorentzScalar gets Gauge link type (cleaner). -- DONE - Simplified the integrators a bit. -- DONE @@ -123,6 +135,26 @@ RECENT - Parallel io improvements -- DONE - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE + +DONE: +- MultiArray -- MultiRHS done +- ConjugateGradientMultiShift -- DONE +- MCR -- DONE +- Remez -- Mike or Boost? -- DONE +- Proto (ET) -- DONE +- uBlas -- DONE ; Eigen +- Potentially Useful Boost libraries -- DONE ; Eigen +- Aligned allocator; memory pool -- DONE +- Multiprecision -- DONE +- Serialization -- DONE +- Regex -- Not needed +- Tokenize -- Why? + +- Random number state save restore -- DONE +- Rectangle gauge actions. -- DONE + Iwasaki, + Symanzik, + ... etc... Done: Cayley, Partial, ContFrac force terms. DONE @@ -207,6 +239,7 @@ Done FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane) ====================================================================================================== +* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE user pass these? Is this QCD specific?
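For reference, a short worked sketch of the factor-of-two bookkeeping described in the force note at the top of this TODO. This is the standard Wilson-action reasoning; the exact expression elided in the note is not reproduced here. Writing $S_\mu(x)$ for the staple of the link $U_\mu(x)$, the gauge action contains

\[
S_g \;\supset\; -\frac{\beta}{2N}\Big[\,\mathrm{Tr}\big(U_\mu(x)\,S_\mu(x)\big) \;+\; \mathrm{Tr}\big(S_\mu(x)^\dagger\,U_\mu(x)^\dagger\big)\Big].
\]

The two traces are complex conjugates of one another, so each contributes equally to $\partial S_g/\partial U_\mu(x)$; differentiating only the first trace therefore yields half of the true derivative, which is the factor of 2 restored by hand in the fermion routines and in the Test_xxx_force tests.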
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 686d00a1..7009980c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -152,9 +152,6 @@ int main (int argc, char ** argv) RealD NP = UGrid->_Nprocessors; - std::cout << GridLogMessage << "Creating action operator " << std::endl; - DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); Dw.Dhop(src,result,0); + std::cout<Barrier(); + DwH.ZeroCounters(); + DwH.Dhop(src,result,0); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu #include // Lanczos support -#include +//#include #include #include #include diff --git a/lib/algorithms/approx/.dirstamp b/lib/algorithms/approx/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/algorithms/iterative/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h similarity index 100% rename from lib/algorithms/iterative/DenseMatrix.h rename to lib/algorithms/densematrix/DenseMatrix.h diff --git a/lib/algorithms/iterative/Francis.h b/lib/algorithms/densematrix/Francis.h similarity index 100% rename from lib/algorithms/iterative/Francis.h rename to lib/algorithms/densematrix/Francis.h diff --git a/lib/algorithms/iterative/Householder.h b/lib/algorithms/densematrix/Householder.h similarity index 100% rename from lib/algorithms/iterative/Householder.h rename to lib/algorithms/densematrix/Householder.h diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h new file mode 100644 index 00000000..d90194ae --- /dev/null +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -0,0 +1,366 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h + +Copyright (C) 2017 + +Author: Azusa Yamaguchi +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H +#define GRID_BLOCK_CONJUGATE_GRADIENT_H + + +namespace Grid { + +////////////////////////////////////////////////////////////////////////// +// Block conjugate gradient. 
Dimension zero should be the block direction +////////////////////////////////////////////////////////////////////////// +template +class BlockConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + + SolverTimer.Stop(); + + std::cout << GridLogMessage<<"BlockCG converged in "< +class MultiRHSConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + + std::cout< v_pAp(Nblock); + std::vector v_rr (Nblock); + std::vector v_rr_inv(Nblock); + std::vector v_alpha(Nblock); + std::vector v_beta(Nblock); + + // Initial residual computation & set up + std::vector residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + + SolverTimer.Stop(); + + std::cout << GridLogMessage<<"MultiRHS solver converged in " < { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << 
"ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -99,8 +93,7 @@ class ConjugateGradient : public OperatorFunction { } std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: k=0 residual " << cp << " target " << rsq - << std::endl; + << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; @@ -145,19 +138,20 @@ class ConjugateGradient : public OperatorFunction { RealD resnorm = sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage - << "ConjugateGradient: Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) - << " true residual " << true_residual << " target " - << Tolerance << std::endl; - std::cout << GridLogMessage << "Time elapsed: Iterations " - << SolverTimer.Elapsed() << " Matrix " - << MatrixTimer.Elapsed() << " Linalg " - << LinalgTimer.Elapsed(); - std::cout << std::endl; + std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< #define GRID_IRL_H #include //memset + #ifdef USE_LAPACK #ifdef USE_MKL #include @@ -43,8 +44,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, //#include #endif #endif -#include "DenseMatrix.h" -#include "EigenSort.h" + +#include +#include // eliminate temorary vector in calc() #define MEM_SAVE @@ -1321,8 +1323,6 @@ static void Lock(DenseMatrix &H, ///Hess mtx int dfg, bool herm) { - - //ForceTridiagonal(H); int M = H.dim; @@ -1354,7 +1354,6 @@ static void Lock(DenseMatrix &H, ///Hess mtx AH = Hermitian(QQ)*AH; AH = AH*QQ; - for(int i=con;i - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef MATRIX_H -#define MATRIX_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/** Sign function **/ -template T sign(T p){return ( p/abs(p) );} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////// Hijack STL containers for our wicked means ///////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -template using Vector = Vector; -template using Matrix = Vector >; - -template void Resize(Vector & vec, int N) { vec.resize(N); } - -template void Resize(Matrix & mat, int N, int M) { - mat.resize(N); - for(int i=0;i void Size(Vector & vec, int &N) -{ - N= vec.size(); -} -template void Size(Matrix & mat, int &N,int &M) -{ - N= mat.size(); - M= mat[0].size(); -} -template void SizeSquare(Matrix & mat, int &N) -{ - int M; Size(mat,N,M); - assert(N==M); -} -template void SizeSame(Matrix & mat1,Matrix &mat2, int &N1,int &M1) -{ - int N2,M2; - Size(mat1,N1,M1); - Size(mat2,N2,M2); - assert(N1==N2); - assert(M1==M2); -} - -//***************************************** -//* (Complex) Vector operations * -//***************************************** - -/**Conj of a Vector **/ -template Vector conj(Vector p){ - Vector q(p.size()); - for(int i=0;i T norm(Vector p){ - T sum = 0; - for(int i=0;i T norm2(Vector p){ - T sum = 0; - for(int i=0;i T trace(Vector p){ - T sum = 0; - for(int i=0;i void Fill(Vector &p, T c){ - for(int i=0;i void normalize(Vector &p){ - T m = norm(p); - if( abs(m) > 0.0) for(int i=0;i Vector times(Vector p, U s){ - for(int i=0;i Vector times(U s, Vector p){ - for(int i=0;i T inner(Vector a, Vector b){ - T m = 0.; - for(int i=0;i Vector add(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i Vector sub(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i void Fill(Matrix & mat, T&val) { - int N,M; - Size(mat,N,M); - for(int i=0;i Transpose(Matrix & mat){ - int N,M; - Size(mat,N,M); - Matrix C; Resize(C,M,N); - for(int i=0;i void Unity(Matrix &mat){ - int N; SizeSquare(mat,N); - for(int i=0;i -void PlusUnit(Matrix & A,T c){ - int dim; SizeSquare(A,dim); - for(int i=0;i HermitianConj(Matrix &mat){ - - int dim; SizeSquare(mat,dim); - - Matrix C; Resize(C,dim,dim); - - for(int i=0;i diag(Matrix &A) -{ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - - for(int i=0;i operator *(Vector &B,Matrix &A) -{ - int K,M,N; - Size(B,K); - Size(A,M,N); - assert(K==M); - - Vector C; Resize(C,N); - - for(int j=0;j inv_diag(Matrix & A){ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - for(int i=0;i operator + (Matrix &A,Matrix &B) -{ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator- (Matrix & A,Matrix &B){ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix & A,T c){ - int N,M; Size(A,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix &A,Matrix &B){ - int K,L,N,M; - Size(A,K,L); - Size(B,N,M); assert(L==N); - Matrix C; Resize(C,K,M); - - for(int i=0;i operator* (Matrix &A,Vector &B){ - int M,N,K; - Size(A,N,M); - Size(B,K); assert(K==M); - Vector C; Resize(C,N); - for(int i=0;i T LargestDiag(Matrix &A) -{ - int dim ; SizeSquare(A,dim); - - T ld = abs(A[0][0]); - for(int i=1;i 
abs(ld) ){ld = cf;} - } - return ld; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_subdiag(Matrix &A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l-1]=(U)0.0; - return l; - } - } - return 0; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_symm_subdiag(Matrix & A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l - 1] = (U)0.0; - A[l - 1][l] = (U)0.0; - return l; - } - } - return 0; -} -/**Assign a submatrix to a larger one**/ -template -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i -Matrix GetSubMtx(Matrix &A,int row_st, int row_end, int col_st, int col_end) -{ - Matrix H; Resize(row_end - row_st,col_end-col_st); - - for(int i = row_st; i -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i T proj(Matrix A, Vector B){ - int dim; SizeSquare(A,dim); - int dimB; Size(B,dimB); - assert(dimB==dim); - T C = 0; - for(int i=0;i q Q -template void times(Vector &q, Matrix &Q) -{ - int M; SizeSquare(Q,M); - int N; Size(q,N); - assert(M==N); - - times(q,Q,N); -} - -/// q -> q Q -template void times(multi1d &q, Matrix &Q, int N) -{ - GridBase *grid = q[0]._grid; - int M; SizeSquare(Q,M); - int K; Size(q,K); - assert(N S(N,grid ); - for(int j=0;j - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_MATRIX_UTILS_H -#define GRID_MATRIX_UTILS_H - -namespace Grid { - - namespace MatrixUtils { - - template inline void Size(Matrix& A,int &N,int &M){ - N=A.size(); assert(N>0); - M=A[0].size(); - for(int i=0;i inline void SizeSquare(Matrix& A,int &N) - { - int M; - Size(A,N,M); - assert(N==M); - } - - template inline void Fill(Matrix& A,T & val) - { - int N,M; - Size(A,N,M); - for(int i=0;i inline void Diagonal(Matrix& A,T & val) - { - int N; - SizeSquare(A,N); - for(int i=0;i inline void Identity(Matrix& A) - { - Fill(A,0.0); - Diagonal(A,1.0); - } - - }; -} -#endif diff --git a/lib/algorithms/iterative/TODO b/lib/algorithms/iterative/TODO deleted file mode 100644 index ca3bca3b..00000000 --- a/lib/algorithms/iterative/TODO +++ /dev/null @@ -1,15 +0,0 @@ -- ConjugateGradientMultiShift -- MCR - -- Potentially Useful Boost libraries - -- MultiArray -- Aligned allocator; memory pool -- Remez -- Mike or Boost? 
-- Multiprecision -- quaternians -- Tokenize -- Serialization -- Regex -- Proto (ET) -- uBlas diff --git a/lib/algorithms/iterative/bisec.c b/lib/algorithms/iterative/bisec.c deleted file mode 100644 index a5117aee..00000000 --- a/lib/algorithms/iterative/bisec.c +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include - -struct Bisection { - -static void get_eig2(int row_num,std::vector &ALPHA,std::vector &BETA, std::vector & eig) -{ - int i,j; - std::vector evec1(row_num+3); - std::vector evec2(row_num+3); - RealD eps2; - ALPHA[1]=0.; - BETHA[1]=0.; - for(i=0;imag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - end--; - for(i=end-1;i>=begin;i--){ - if(mag(evec2[i])>mag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - begin++; - } - - for(i=0;i &c, - std::vector &b, - int n, - int m1, - int m2, - RealD eps1, - RealD relfeh, - std::vector &x, - RealD &eps2) -{ - std::vector wu(n+2); - - RealD h,q,x1,xu,x0,xmin,xmax; - int i,a,k; - - b[1]=0.0; - xmin=c[n]-fabs(b[n]); - xmax=c[n]+fabs(b[n]); - for(i=1;ixmax) xmax= c[i]+h; - if(c[i]-h0.0 ? xmax : -xmin); - if(eps1<=0.0) eps1=eps2; - eps2=0.5*eps1+7.0*(eps2); - x0=xmax; - for(i=m1;i<=m2;i++){ - x[i]=xmax; - wu[i]=xmin; - } - - for(k=m2;k>=m1;k--){ - xu=xmin; - i=k; - do{ - if(xu=m1); - if(x0>x[k]) x0=x[k]; - while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ - x1=(xu+x0)/2; - - a=0; - q=1.0; - for(i=1;i<=n;i++){ - q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); - if(q<0) a++; - } - // printf("x1=%e a=%d\n",x1,a); - if(ax1) x[a]=x1; - } - }else x0=x1; - } - x[k]=(x0+xu)/2; - } -} -} diff --git a/lib/algorithms/iterative/get_eig.c b/lib/algorithms/iterative/get_eig.c deleted file mode 100644 index d3f5a12f..00000000 --- a/lib/algorithms/iterative/get_eig.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/lib/communicator/.dirstamp b/lib/communicator/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 48708800..1be672e8 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -30,21 +30,11 @@ Author: Peter Boyle namespace Grid { -template -class SimpleCompressor { -public: - void Point(int) {}; - - vobj operator() (const vobj &arg) { - return arg; - } -}; - /////////////////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split with compression +// Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// -template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) +template void +Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs._grid->_rdimensions[dimension]; @@ -62,7 +52,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen for(int b=0;b &rhs,commVector &buffer,int dimen } } parallel_for(int i=0;i void -Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) +template void +Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; @@ -109,8 +98,8 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } @@ -127,32 +116,14 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + 
vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } } } } -////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_simple (const Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); -} - -////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_extract(rhs,pointers,dimension,plane,cbmask,dontcompress); -} - ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// @@ -200,7 +171,7 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) +template void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h index b2a44961..a66b49bf 100644 --- a/lib/cshift/Cshift_mpi.h +++ b/lib/cshift/Cshift_mpi.h @@ -154,13 +154,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); grid->Barrier(); - /* - for(int i=0;i void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 45a88a64..3affb86a 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -30,6 +30,8 @@ Author: paboyle #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H +#include + namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -38,120 +40,123 @@ namespace Grid { //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return std::real(nrm); +template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return std::real(nrm); +} + +// Double inner product +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +{ + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + scalar_type nrm; + + GridBase *grid = left._grid; + + std::vector > sumarray(grid->SumArraySize()); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ssSumArraySize();i++){ + vvnrm = vvnrm+sumarray[i]; + } + nrm = Reduce(vvnrm);// sum across simd + right._grid->GlobalSum(nrm); + return nrm; +} + +template +inline auto sum(const LatticeUnaryExpression & expr) + ->typename 
decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object +{ + return sum(closure(expr)); +} - template - inline ComplexD innerProduct(const Lattice &left,const Lattice &right) - { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - scalar_type nrm; - - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; - } - nrm = Reduce(vvnrm);// sum across simd - right._grid->GlobalSum(nrm); - return nrm; - } - - template - inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object - { - return sum(closure(expr)); - } - - template - inline auto sum(const LatticeBinaryExpression & expr) +template +inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object - { - return sum(closure(expr)); - } - - - template - inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), - eval(0,std::get<1>(expr.second)), - eval(0,std::get<2>(expr.second)) - ))::scalar_object - { - return sum(closure(expr)); - } - - template - inline typename vobj::scalar_object sum(const Lattice &arg){ - - GridBase *grid=arg._grid; - int Nsimd = grid->Nsimd(); - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - - vobj vvsum=zero; - for(int ss=myoff;ssSumArraySize();i++){ - vsum = vsum+sumarray[i]; - } - - typedef typename vobj::scalar_object sobj; - sobj ssum=zero; - - std::vector buf(Nsimd); - extract(vsum,buf); - - for(int i=0;iGlobalSum(ssum); - - return ssum; +{ + return sum(closure(expr)); +} + + +template +inline auto sum(const LatticeTrinaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), + eval(0,std::get<1>(expr.second)), + eval(0,std::get<2>(expr.second)) + ))::scalar_object +{ + return sum(closure(expr)); +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + GridBase *grid=arg._grid; + int Nsimd = grid->Nsimd(); + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; + } + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + + vobj vvsum=zero; + for(int ss=myoff;ssSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + + std::vector buf(Nsimd); + extract(vsum,buf); + + for(int i=0;iGlobalSum(ssum); + + return ssum; +} +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////// template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -163,23 +168,31 @@ template inline void sliceSum(const Lattice &Data,std::vector< int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first - std::vector lsSum(ld,zero); // sum across these down to scalars - std::vector extracted(Nsimd); // splitting the SIMD + std::vector lsSum(ld,zero); // sum across these down to scalars + std::vector extracted(Nsimd); // splitting the SIMD - result.resize(fd); // And then global sum to return the same vector to every node for IO to file + result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r coor(Nd); + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir + // Parallel over orthog direction + parallel_for(int r=0;roSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - lvSum[r]=lvSum[r]+Data._odata[ss]; - } + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n icoor(Nd); @@ -214,10 +227,304 @@ template inline void sliceSum(const Lattice &Data,std::vector< result[t]=gsum; } +} +template +static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + GridBase *grid = lhs._grid; + assert(grid!=NULL); + conformable(grid,rhs._grid); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + std::vector > lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector > extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node for IO to file + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n icoor(Nd); + for(int rt=0;rt temp; + temp._internal = lvSum[rt]; + extract(temp,extracted); + + for(int idx=0;idxiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; + + } + } + + // sum over nodes. 
+ scalar_type gsum; + for(int t=0;t_processor_coor[orthogdim] ) { + gsum=lsSum[lt]; + } else { + gsum=scalar_type(0.0); + } + + grid->GlobalSum(gsum); + + result[t]=gsum; + } +} +template +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss +static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int orthogdim,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced tensor_reduced; + + GridBase *grid = X._grid; + + int Nsimd =grid->Nsimd(); + int Nblock =grid->GlobalDimensions()[orthogdim]; + + int fd =grid->_fdimensions[orthogdim]; + int ld =grid->_ldimensions[orthogdim]; + int rd =grid->_rdimensions[orthogdim]; + + int e1 =grid->_slice_nblock[orthogdim]; + int e2 =grid->_slice_block [orthogdim]; + int stride =grid->_slice_stride[orthogdim]; + + std::vector icoor; + + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + vector_type av; + + for(int l=0;liCoorFromIindex(icoor,l); + int ldx =r+icoor[orthogdim]*rd; + scalar_type *as =(scalar_type *)&av; + as[l] = scalar_type(a[ldx])*scale; + } + + tensor_reduced at; at=av; + + parallel_for_nest2(int n=0;n +static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_rdimensions[Orthog]. 
+////////////////////////////////////////////////////////////////////////////////////////// + +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i &in,Lattice &out) template -void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int orthog) +void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -401,7 +401,7 @@ void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int } template -void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, int orthog) +void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -444,7 +444,7 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in template -void InsertSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { typedef typename vobj::scalar_object sobj; diff --git a/lib/log/Log.h b/lib/log/Log.h index 8dce0acd..74d080bb 100644 --- a/lib/log/Log.h +++ b/lib/log/Log.h @@ -110,8 +110,8 @@ public: friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : "; - stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : "; + stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : "; + stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : "; if ( log.timestamp ) { StopWatch.Stop(); GridTime now = StopWatch.Elapsed(); diff --git a/lib/qcd/action/fermion/.dirstamp b/lib/qcd/action/fermion/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 1f7d4903..dd6ec7bf 
100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -237,6 +237,13 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc index 16fb47bb..6b53d540 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc @@ -137,6 +137,20 @@ template void CayleyFermion5D::MooeeInternal(const FermionField &ps template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + +INSTANTIATE_DPERP(GparityWilsonImplFH); +INSTANTIATE_DPERP(GparityWilsonImplDF); +INSTANTIATE_DPERP(WilsonImplFH); +INSTANTIATE_DPERP(WilsonImplDF); +INSTANTIATE_DPERP(ZWilsonImplFH); +INSTANTIATE_DPERP(ZWilsonImplDF); + +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc index a83b962e..cb9b2957 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc @@ -37,7 +37,6 @@ namespace Grid { namespace QCD { // FIXME -- make a version of these routines with site loop outermost for cache reuse. 
- // Pminus fowards // Pplus backwards template @@ -152,6 +151,13 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif } diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 566624e3..653e6ab3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -808,10 +808,21 @@ INSTANTIATE_DPERP(DomainWallVec5dImplF); INSTANTIATE_DPERP(ZDomainWallVec5dImplD); INSTANTIATE_DPERP(ZDomainWallVec5dImplF); +INSTANTIATE_DPERP(DomainWallVec5dImplDF); +INSTANTIATE_DPERP(DomainWallVec5dImplFH); +INSTANTIATE_DPERP(ZDomainWallVec5dImplDF); +INSTANTIATE_DPERP(ZDomainWallVec5dImplFH); + template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + + }} diff --git a/lib/qcd/action/fermion/Fermion.h b/lib/qcd/action/fermion/Fermion.h index ac42f177..94fbae6c 100644 --- a/lib/qcd/action/fermion/Fermion.h +++ b/lib/qcd/action/fermion/Fermion.h @@ -89,6 +89,10 @@ typedef WilsonFermion WilsonFermionR; typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; +typedef WilsonFermion WilsonFermionRL; +typedef WilsonFermion WilsonFermionFH; +typedef WilsonFermion WilsonFermionDF; + typedef WilsonFermion WilsonAdjFermionR; typedef WilsonFermion WilsonAdjFermionF; typedef WilsonFermion WilsonAdjFermionD; @@ -105,27 +109,50 @@ typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; +typedef DomainWallFermion DomainWallFermionRL; +typedef DomainWallFermion DomainWallFermionFH; +typedef DomainWallFermion DomainWallFermionDF; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; +typedef MobiusFermion MobiusFermionRL; +typedef MobiusFermion MobiusFermionFH; +typedef MobiusFermion MobiusFermionDF; + typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +typedef ZMobiusFermion ZMobiusFermionRL; +typedef ZMobiusFermion ZMobiusFermionFH; +typedef ZMobiusFermion ZMobiusFermionDF; + // Ls vectorised typedef DomainWallFermion DomainWallFermionVec5dR; typedef DomainWallFermion DomainWallFermionVec5dF; typedef DomainWallFermion DomainWallFermionVec5dD; +typedef DomainWallFermion DomainWallFermionVec5dRL; +typedef DomainWallFermion DomainWallFermionVec5dFH; 
+typedef DomainWallFermion DomainWallFermionVec5dDF; + typedef MobiusFermion MobiusFermionVec5dR; typedef MobiusFermion MobiusFermionVec5dF; typedef MobiusFermion MobiusFermionVec5dD; +typedef MobiusFermion MobiusFermionVec5dRL; +typedef MobiusFermion MobiusFermionVec5dFH; +typedef MobiusFermion MobiusFermionVec5dDF; + typedef ZMobiusFermion ZMobiusFermionVec5dR; typedef ZMobiusFermion ZMobiusFermionVec5dF; typedef ZMobiusFermion ZMobiusFermionVec5dD; +typedef ZMobiusFermion ZMobiusFermionVec5dRL; +typedef ZMobiusFermion ZMobiusFermionVec5dFH; +typedef ZMobiusFermion ZMobiusFermionVec5dDF; typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; @@ -166,17 +193,35 @@ typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonP typedef WilsonFermion GparityWilsonFermionR; typedef WilsonFermion GparityWilsonFermionF; typedef WilsonFermion GparityWilsonFermionD; + +typedef WilsonFermion GparityWilsonFermionRL; +typedef WilsonFermion GparityWilsonFermionFH; +typedef WilsonFermion GparityWilsonFermionDF; + typedef DomainWallFermion GparityDomainWallFermionR; typedef DomainWallFermion GparityDomainWallFermionF; typedef DomainWallFermion GparityDomainWallFermionD; +typedef DomainWallFermion GparityDomainWallFermionRL; +typedef DomainWallFermion GparityDomainWallFermionFH; +typedef DomainWallFermion GparityDomainWallFermionDF; + typedef WilsonTMFermion GparityWilsonTMFermionR; typedef WilsonTMFermion GparityWilsonTMFermionF; typedef WilsonTMFermion GparityWilsonTMFermionD; + +typedef WilsonTMFermion GparityWilsonTMFermionRL; +typedef WilsonTMFermion GparityWilsonTMFermionFH; +typedef WilsonTMFermion GparityWilsonTMFermionDF; + typedef MobiusFermion GparityMobiusFermionR; typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; +typedef MobiusFermion GparityMobiusFermionRL; +typedef MobiusFermion GparityMobiusFermionFH; +typedef MobiusFermion GparityMobiusFermionDF; + typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; diff --git a/lib/qcd/action/fermion/FermionCore.h b/lib/qcd/action/fermion/FermionCore.h index 74d94d67..17006961 100644 --- a/lib/qcd/action/fermion/FermionCore.h +++ b/lib/qcd/action/fermion/FermionCore.h @@ -55,7 +55,14 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; + #define AdjointFermOpTemplateInstantiate(A) \ template class A; \ @@ -69,7 +76,11 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; #define FermOpTemplateInstantiate(A) \ FermOp4dVecTemplateInstantiate(A) \ diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 141d808d..f6b47063 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -35,7 +35,6 @@ directory namespace Grid { namespace QCD { - ////////////////////////////////////////////// // Template parameter class constructs to package // externally control Fermion implementations @@ -89,7 +88,53 @@ namespace QCD { // // } ////////////////////////////////////////////// - + + template 
struct SamePrecisionMapper { + typedef T HigherPrecVector ; + typedef T LowerPrecVector ; + }; + template struct LowerPrecisionMapper { }; + template <> struct LowerPrecisionMapper { + typedef vRealF HigherPrecVector ; + typedef vRealH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vRealD HigherPrecVector ; + typedef vRealF LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexF HigherPrecVector ; + typedef vComplexH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexD HigherPrecVector ; + typedef vComplexF LowerPrecVector ; + }; + + struct CoeffReal { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffRealHalfComms { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; + struct CoeffComplex { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffComplexHalfComms { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; //////////////////////////////////////////////////////////////////////// // Implementation dependent fermion types @@ -114,37 +159,40 @@ namespace QCD { ///////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index ///////////////////////////////////////////////////////////////////////////// - template + template class WilsonImpl : public PeriodicGaugeImpl > { - public: static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; + static const int Nhcs = Options::Nhcs; + typedef PeriodicGaugeImpl > Gimpl; + INHERIT_GIMPL_TYPES(Gimpl); //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 
1 : 0;} - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - INHERIT_GIMPL_TYPES(Gimpl); + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; typedef iImplSpinor SiteSpinor; typedef iImplPropagator SitePropagator; typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -209,31 +257,34 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index, 5d redblack //////////////////////////////////////////////////////////////////////////////////// - -template +template class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { public: - - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + static const int Dimension = Nrepresentation; + static const bool LsVectorised=true; + static const int Nhcs = Options::Nhcs; + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; template using iImplGaugeField = iVector >, Nd>; template using iImplGaugeLink = iScalar > >; - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; - typedef Lattice PropagatorField; - + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; + typedef Lattice FermionField; + typedef Lattice PropagatorField; ///////////////////////////////////////////////// // Make the doubled gauge field a *scalar* @@ -241,9 +292,9 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar typedef iImplGaugeField SiteScalarGaugeField; // scalar typedef iImplGaugeLink SiteScalarGaugeLink; // scalar - typedef Lattice DoubledGaugeField; + typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -311,35 +362,37 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? 
//////////////////////////////////////////////////////////////////////////////////////// - -template +template class GparityWilsonImpl : public ConjugateGaugeImpl > { public: static const int Dimension = Nrepresentation; + static const int Nhcs = Options::Nhcs; + static const bool LsVectorised=false; - const bool LsVectorised=false; - - typedef _Coeff_t Coeff_t; typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; - template using iImplSpinor = iVector, Ns>, Ngp>; - template using iImplPropagator = iVector, Ns>, Ngp >; - template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplPropagator = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplHalfCommSpinor = iVector, Nhcs>, Ngp>; template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; + + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonStencil StencilImpl; typedef GparityWilsonImplParams ImplParams; @@ -356,8 +409,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl - class StaggeredImpl : public PeriodicGaugeImpl > { +///////////////////////////////////////////////////////////////////////////// +// Single flavour one component spinors with colour index +///////////////////////////////////////////////////////////////////////////// +template +class StaggeredImpl : public PeriodicGaugeImpl > { public: typedef RealD _Coeff_t ; static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; typedef PeriodicGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - const bool LsVectorised=false; typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -641,8 +692,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -823,43 +870,61 @@ class GparityWilsonImpl : public ConjugateGaugeImpl WilsonImplR; // Real.. whichever prec +typedef WilsonImpl WilsonImplF; // Float +typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl WilsonImplRL; // Real.. whichever prec +typedef WilsonImpl WilsonImplFH; // Float +typedef WilsonImpl WilsonImplDF; // Double - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec +typedef WilsonImpl ZWilsonImplF; // Float +typedef WilsonImpl ZWilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double +typedef WilsonImpl ZWilsonImplRL; // Real.. 
whichever prec +typedef WilsonImpl ZWilsonImplFH; // Float +typedef WilsonImpl ZWilsonImplDF; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec +typedef WilsonImpl WilsonAdjImplF; // Float +typedef WilsonImpl WilsonAdjImplD; // Double - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double +typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec +typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float +typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplRL; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplDF; // Double - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double +typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + +typedef DomainWallVec5dImpl ZDomainWallVec5dImplRL; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplDF; // Double + +typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplF; // Float +typedef GparityWilsonImpl GparityWilsonImplD; // Double + +typedef GparityWilsonImpl GparityWilsonImplRL; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplFH; // Float +typedef GparityWilsonImpl GparityWilsonImplDF; // Double - typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec - typedef StaggeredImpl StaggeredImplF; // Float - typedef StaggeredImpl StaggeredImplD; // Double +typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec +typedef StaggeredImpl StaggeredImplF; // Float +typedef StaggeredImpl StaggeredImplD; // Double - typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. whichever prec - typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float - typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double +typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. 
whichever prec +typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float +typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double }} diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 2ba4f4af..5ce2b335 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -160,8 +160,6 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - std::cout << " Umu " << Umu._odata[0]< namespace Grid { namespace QCD { - template - class WilsonCompressor { - public: - int mu; - int dag; +///////////////////////////////////////////////////////////////////////////////////////////// +// optimised versions supporting half precision too +///////////////////////////////////////////////////////////////////////////////////////////// - WilsonCompressor(int _dag){ - mu=0; - dag=_dag; - assert((dag==0)||(dag==1)); - } - void Point(int p) { - mu=p; - }; +template +class WilsonCompressorTemplate; - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - int mudag=mu; - if (!dag) { - mudag=(mu+Nd)%(2*Nd); - } - switch(mudag) { - case Xp: - spProjXp(ret,in); - break; - case Yp: - spProjYp(ret,in); - break; - case Zp: - spProjZp(ret,in); - break; - case Tp: - spProjTp(ret,in); - break; - case Xm: - spProjXm(ret,in); - break; - case Ym: - spProjYm(ret,in); - break; - case Zm: - spProjZm(ret,in); - break; - case Tm: - spProjTm(ret,in); - break; - default: - assert(0); - break; - } - return ret; - } - }; - ///////////////////////// - // optimised versions - ///////////////////////// +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; - template - class WilsonXpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXp(ret,in); - return ret; - } - }; - template - class WilsonYpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYp(ret,in); - return ret; - } - }; - template - class WilsonZpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZp(ret,in); - return ret; - } - }; - template - class WilsonTpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTp(ret,in); - return ret; - } - }; + void Point(int p) { mu=p; }; - template - class WilsonXmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXm(ret,in); - return ret; - } - }; - template - class WilsonYmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYm(ret,in); - return ret; - } - }; - template - class WilsonZmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZm(ret,in); - return ret; - } - }; - template - class WilsonTmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTm(ret,in); - return ret; - } - }; + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } - // Fast comms buffer manipulation which should inline right through (avoid direction - // dependent logic that prevents inlining - template - class WilsonStencil : public CartesianStencil { - 
public: + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } - WilsonStencil(GridBase *grid, + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + projector::Proj(buf[o],in,mu,dag); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o) { + assert(0); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(out0[j],out1[j],temp1,temp2,type); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return false; } + +}; + +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; + + void Point(int p) { mu=p; }; + + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } + + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); + + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } + + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + SiteHalfSpinor hsp; + SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; + projector::Proj(hsp,in,mu,dag); + precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + SiteHalfSpinor vt0,vt1; + SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; + 
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; + precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw); + precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw); + exchange(mp[2*o],mp[2*o+1],vt0,vt1,type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o){ + SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; + precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; + SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(temp3,temp4,temp1,temp2,type); + precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw); + precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return true; } + +}; + +#define DECLARE_PROJ(Projector,Compressor,spProj) \ + class Projector { \ + public: \ + template \ + static void Proj(hsp &result,const fsp &in,int mu,int dag){ \ + spProj(result,in); \ + } \ + }; \ +template using Compressor = WilsonCompressorTemplate; + +DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp); +DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp); +DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp); +DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp); +DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm); +DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm); +DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm); +DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm); + +class WilsonProjector { + public: + template + static void Proj(hsp &result,const fsp &in,int mu,int dag){ + int mudag=dag? 
mu : (mu+Nd)%(2*Nd); + switch(mudag) { + case Xp: spProjXp(result,in); break; + case Yp: spProjYp(result,in); break; + case Zp: spProjZp(result,in); break; + case Tp: spProjTp(result,in); break; + case Xm: spProjXm(result,in); break; + case Ym: spProjYm(result,in); break; + case Zm: spProjZm(result,in); break; + case Tm: spProjTm(result,in); break; + default: assert(0); break; + } + } +}; +template using WilsonCompressor = WilsonCompressorTemplate; + +// Fast comms buffer manipulation which should inline right through (avoid direction +// dependent logic that prevents inlining +template +class WilsonStencil : public CartesianStencil { +public: + + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + + std::vector same_node; + std::vector surface_list; + + WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, - const std::vector &distances) : CartesianStencil (grid,npoints,checkerboard,directions,distances) - { }; - - template < class compressor> - void HaloExchangeOpt(const Lattice &source,compressor &compress) - { - std::vector > reqs; - HaloExchangeOptGather(source,compress); - this->CommunicateBegin(reqs); - this->calls++; - this->CommunicateComplete(reqs); - this->CommsMerge(); - } - - template < class compressor> - void HaloExchangeOptGather(const Lattice &source,compressor &compress) - { - this->calls++; - this->Mergers.resize(0); - this->Packets.resize(0); - this->HaloGatherOpt(source,compress); - } - - - template < class compressor> - void HaloGatherOpt(const Lattice &source,compressor &compress) - { - this->_grid->StencilBarrier(); - // conformable(source._grid,_grid); - assert(source._grid==this->_grid); - this->halogtime-=usecond(); - - this->u_comm_offset=0; - - int dag = compress.dag; - - WilsonXpCompressor XpCompress; - WilsonYpCompressor YpCompress; - WilsonZpCompressor ZpCompress; - WilsonTpCompressor TpCompress; - WilsonXmCompressor XmCompress; - WilsonYmCompressor YmCompress; - WilsonZmCompressor ZmCompress; - WilsonTmCompressor TmCompress; - - // Gather all comms buffers - // for(int point = 0 ; point < _npoints; point++) { - // compress.Point(point); - // HaloGatherDir(source,compress,point,face_idx); - // } - int face_idx=0; - if ( dag ) { - // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); - this->HaloGatherDir(source,YpCompress,Yp,face_idx); - this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - this->HaloGatherDir(source,TpCompress,Tp,face_idx); - this->HaloGatherDir(source,XmCompress,Xm,face_idx); - this->HaloGatherDir(source,YmCompress,Ym,face_idx); - this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - this->HaloGatherDir(source,TmCompress,Tm,face_idx); - } else { - this->HaloGatherDir(source,XmCompress,Xp,face_idx); - this->HaloGatherDir(source,YmCompress,Yp,face_idx); - this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - this->HaloGatherDir(source,TmCompress,Tp,face_idx); - this->HaloGatherDir(source,XpCompress,Xm,face_idx); - this->HaloGatherDir(source,YpCompress,Ym,face_idx); - this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - this->HaloGatherDir(source,TpCompress,Tm,face_idx); - } - this->face_table_computed=1; - assert(this->u_comm_offset==this->_unified_buffer_size); - this->halogtime+=usecond(); - } - + const std::vector &distances) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) , + same_node(npoints) + { + surface_list.resize(0); }; + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance 
is 1 for WilsonStencil + for(int point=0;point<this->_npoints;point++){ + same_node[point] = this->SameNode(point); + // std::cout << " dir " <<point<<" same_node "<<same_node[point]<<std::endl; + } + + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point<this->_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + + template < class compressor> + void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) + { + std::vector<std::vector<CommsRequest_t> > reqs; + this->HaloExchangeOptGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + this->CommsMerge(compress); + this->CommsMergeSHM(compress); + } + + template <class compressor> + void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) + { + this->Prepare(); + this->HaloGatherOpt(source,compress); + } + + template <class compressor> + void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress) + { + // Strategy. Inherit types from Compressor. + // Use types to select the write direction by direction compressor + typedef typename compressor::SiteSpinor SiteSpinor; + typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; + typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + + this->_grid->StencilBarrier(); + + assert(source._grid==this->_grid); + this->halogtime-=usecond(); + + this->u_comm_offset=0; + + WilsonXpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XpCompress; + WilsonYpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YpCompress; + WilsonZpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZpCompress; + WilsonTpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TpCompress; + WilsonXmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XmCompress; + WilsonYmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YmCompress; + WilsonZmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZmCompress; + WilsonTmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TmCompress; + + int dag = compress.dag; + int face_idx=0; + if ( dag ) { + // std::cout << " Optimised Dagger compress " <<std::endl; + assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + } else { + assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + } + this->face_table_computed=1; + assert(this->u_comm_offset==this->_unified_buffer_size); + this->halogtime+=usecond(); + } + + }; }} // namespace close #endif diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 88bc425a..6f07a076 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -117,49 +117,20 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); + + // Build lists of exterior only nodes + int LLs = FiveDimGrid._rdimensions[0]; + int vol4; + vol4=FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4);
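// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] BuildSurfaceList above records the
// 4-d sites with at least one stencil leg that needs true off-node
// communication; the exterior pass can then visit only those sites. A toy 1-d
// model of the same classification (L and the two predicates are illustrative
// stand-ins for GetNodeLocal and the SHM same_node test):
#include <cstdio>
#include <vector>

int main() {
  const int L = 8;                               // local sites along one axis
  std::vector<int> surface_list;
  for (int site = 0; site < L; site++) {
    int local = 1;
    for (int point = 0; point < 2; point++) {    // +1 and -1 neighbours
      int nbr = (point == 0) ? site + 1 : site - 1;
      bool node_local = (nbr >= 0 && nbr < L);   // stand-in for GetNodeLocal
      bool same_node  = false;                   // stand-in for SHM locality
      if (!node_local && !same_node) local = 0;
    }
    if (local == 0) surface_list.push_back(site);
  }
  for (int s : surface_list) std::printf("surface site %d\n", s); // 0 and 7
}
// ---------------------------------------------------------------------------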
+ vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + + std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() + <<" " << StencilEven.surface_list.size()< -WilsonFermion5D::WilsonFermion5D(int simd,GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - RealD _M5,const ImplParams &p) : -{ - int nsimd = Simd::Nsimd(); - - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction - assert(FourDimGrid._ndimension==4); - - // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimGrid._processors[0] ==1); - assert(FiveDimGrid._simd_layout[0] ==nsimd); - - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); - assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); - - // Other dimensions must match the decomposition of the four-D fields - for(int d=0;d<4;d++){ - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimGrid._simd_layout[d]=1); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); - } - - { - } -} - */ template void WilsonFermion5D::Report(void) @@ -396,6 +367,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DhopTotalTime+=usecond(); } + template void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -409,12 +381,21 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg int LLs = in._grid->_rdimensions[0]; int len = U._grid->oSites(); - + DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); DhopFaceTime+=usecond(); std::vector > reqs; + // Rely on async comms; start comms before merge of local data + DhopCommTime-=usecond(); + st.CommunicateBegin(reqs); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+=usecond(); + + // Perhaps use omp task and region #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -425,8 +406,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg int sF = LLs * myoff; if ( me == 0 ) { - DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); st.CommunicateComplete(reqs); DhopCommTime+=usecond(); } else { @@ -439,28 +418,37 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } DhopFaceTime-=usecond(); - st.CommsMerge(); + st.CommsMerge(compressor); DhopFaceTime+=usecond(); -#pragma omp parallel - { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; - - GridThread::GetWork(len,me,mywork,myoff,nthreads); - int sF = LLs * myoff; - - // Exterior links in stencil - if ( me==0 ) DhopComputeTime2-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - if ( me==0 ) DhopComputeTime2+=usecond(); - }// end parallel region + // Load imbalance alert. 
Should use dynamic schedule OMP for loop + // Perhaps create a list of only those sites with face work, and + // load balance process the list. + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } + DhopComputeTime2+=usecond(); #else assert(0); #endif + } + + + template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -679,7 +667,6 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe } - FermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 6e72e089..03c066b0 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -33,52 +33,8 @@ directory namespace Grid { namespace QCD { - int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; - int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; - -#ifdef QPX -#include -#include -#include -#include -#endif - -void bgq_l1p_optimisation(int mode) -{ -#ifdef QPX -#undef L1P_CFG_PF_USR -#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ - - uint64_t cfg_pf_usr; - if ( mode ) { - cfg_pf_usr = - L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; - // if ( sizeof(Float) == sizeof(double) ) { - cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; - // } else { - // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; - // } - } else { - cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) - | L1P_CFG_PF_USR_dfetch_max_footprint(2) - | L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_stream_prefetch_enable; - } - *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; - -#endif - -} - +int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; +int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; template WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; @@ -86,12 +42,72 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// // Generic implementation; move to different file? 
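// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The DhopInternalOverlappedComms
// rework in WilsonFermion5D.cc above follows the standard overlap schedule:
// start the non-blocking halo exchange, merge the shared-memory faces, compute
// interior sites while the network progresses, complete the comms, then
// compute only the surface sites. A generic C++ model of that schedule (the
// three work functions are placeholders, not Grid calls):
#include <cstdio>
#include <future>

void halo_exchange()    { /* non-blocking comms would progress here */ }
void interior_compute() { std::puts("interior sites: no remote data needed"); }
void exterior_compute() { std::puts("surface sites: consume received halo"); }

int main() {
  auto comms = std::async(std::launch::async, halo_exchange); // CommunicateBegin
  interior_compute();                                         // overlap window
  comms.wait();                                               // CommunicateComplete
  exterior_compute();                                         // surface_list pass
}
// ---------------------------------------------------------------------------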
//////////////////////////////////////////// + +#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = &chi; \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); + +#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = &chi; \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + } +#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + chi_p = &buf[SE->_offset]; \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + if (SE->_is_local && SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else if (SE->_is_local) { \ + spProj(chi, in._odata[SE->_offset]); \ + } else { \ + chi = buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); \ + Recon(result, Uchi); \ + } + + //////////////////////////////////////////////////////////////////// + // All legs kernels ; comms then compute + //////////////////////////////////////////////////////////////////// template <class Impl> void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out, - int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -100,174 +116,22 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; -// Need controls to do interior, exterior, or both template <class Impl> void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out,int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -276,168 +140,123 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, Do StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = &chi; - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; + //////////////////////////////////////////////////////////////////// + // Interior kernels + ////////////////////////////////////////////////////////////////////
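// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The _INT/_EXT macro pair defined
// earlier partitions the eight legs of the interior and exterior kernels that
// follow: _INT handles a leg when the neighbour is local or reachable through
// shared memory, _EXT handles exactly the complement and accumulates into the
// result already written by the interior pass (the nmu counter guards the
// final accumulate). A tiny check of that partition invariant, with made-up
// leg flags:
#include <cassert>

int main() {
  const int npoints = 8;
  bool is_local[npoints]  = {1,0,1,1,0,1,0,1};      // illustrative flags
  bool same_node[npoints] = {0,1,0,0,0,0,1,0};      // illustrative flags
  int handled_int = 0, handled_ext = 0;
  for (int d = 0; d < npoints; d++) {
    if (is_local[d] || same_node[d]) handled_int++; // GENERIC_STENCIL_LEG_INT
    else                             handled_ext++; // GENERIC_STENCIL_LEG_EXT
  }
  assert(handled_int + handled_ext == npoints);     // every leg exactly once
  return 0;
}
// ---------------------------------------------------------------------------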
+template +void WilsonKernels::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + + result=zero; + GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; + +template +void WilsonKernels::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + result=zero; + GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; +//////////////////////////////////////////////////////////////////// +// Exterior kernels +//////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; + +template +void WilsonKernels::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; template void 
WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, @@ -451,119 +270,14 @@ void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal int ptype; SE = st.GetEntry(ptype, dir, sF); - - // Xp - if (gamma == Xp) { - if (SE->_is_local && SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - - // Yp - if (gamma == Yp) { - if (SE->_is_local && SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYp(result, Uchi); - } - - // Zp - if (gamma == Zp) { - if (SE->_is_local && SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZp(result, Uchi); - } - - // Tp - if (gamma == Tp) { - if (SE->_is_local && SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTp(result, Uchi); - } - - // Xm - if (gamma == Xm) { - if (SE->_is_local && SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXm(result, Uchi); - } - - // Ym - if (gamma == Ym) { - if (SE->_is_local && SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYm(result, Uchi); - } - - // Zm - if (gamma == Zm) { - if (SE->_is_local && SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZm(result, Uchi); - } - - // Tm - if (gamma == Tm) { - if (SE->_is_local && SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTm(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); + GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); + GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); + GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); + GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm); + GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm); + GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); + GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); vstream(out._odata[sF], result); } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 20ee87f2..2cf52660 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ 
b/lib/qcd/action/fermion/WilsonKernels.h @@ -34,8 +34,6 @@ directory namespace Grid { namespace QCD { -void bgq_l1p_optimisation(int mode); - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper routines that implement Wilson stencil for a single site. // Common to both the WilsonFermion and WilsonFermion5D @@ -44,9 +42,8 @@ class WilsonKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { CommsAndCompute, CommsThenCompute }; - // S-direction is INNERMOST and takes no part in the parity. - static int Opt; // these are a temporary hack - static int Comms; // these are a temporary hack + static int Opt; + static int Comms; }; template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { @@ -66,7 +63,7 @@ public: switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSite (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); @@ -75,7 +72,9 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -84,7 +83,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -99,11 +101,14 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { // no kernel choice for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -113,13 +118,13 @@ public: template typename std::enable_if::type DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { - + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) +{ bgq_l1p_optimisation(1); switch(Opt) { 
#if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSiteDag (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); @@ -128,7 +133,10 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -137,7 +145,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -156,7 +167,10 @@ public: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -169,36 +183,60 @@ public: private: // Specialised variants void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField 
&in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + public: WilsonKernels(const ImplParams &p = ImplParams()); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 365be69a..cd5d2430 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -112,5 +112,16 @@ INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); +INSTANTIATE_ASM(WilsonImplFH); +INSTANTIATE_ASM(WilsonImplDF); +INSTANTIATE_ASM(ZWilsonImplFH); +INSTANTIATE_ASM(ZWilsonImplDF); +INSTANTIATE_ASM(GparityWilsonImplFH); +INSTANTIATE_ASM(GparityWilsonImplDF); +INSTANTIATE_ASM(DomainWallVec5dImplFH); +INSTANTIATE_ASM(DomainWallVec5dImplDF); +INSTANTIATE_ASM(ZDomainWallVec5dImplFH); +INSTANTIATE_ASM(ZDomainWallVec5dImplDF); + }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h index 1839e9bc..948c16a2 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h @@ -71,6 +71,16 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void 
+WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -84,6 +94,16 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -97,6 +117,16 @@ template<> void WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -115,6 +145,16 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -128,6 +168,16 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -141,6 +191,16 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + 
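Aside on the pattern used throughout WilsonKernelsAsmAvx512.h: each specialisation above is a bare function signature whose body arrives by textually including the shared assembler body header under one of three macro regimes (INTERIOR_AND_EXTERIOR, INTERIOR, EXTERIOR), so a single body source generates the full, interior-only, and exterior-only kernels. A minimal, self-contained sketch of the technique with hypothetical names (the shared header is inlined here as KERNEL_BODY for brevity; this is not the actual Grid body):

    // sketch.cpp -- hypothetical illustration of macro-regime code generation
    #include <cstdio>

    static bool is_local(int d) { return d < 4; }       // stand-in for st.same_node[]
    static void process(int d)  { std::printf("leg %d\n", d); }

    // Stand-in for the shared body header; LEG is rebound before each inclusion.
    #define KERNEL_BODY { for (int leg = 0; leg < 8; leg++) LEG(leg); }

    #define LEG(d) process(d)                    // INTERIOR_AND_EXTERIOR regime
    void kernel_full() KERNEL_BODY
    #undef LEG

    #define LEG(d) if (is_local(d)) process(d)   // INTERIOR regime: local legs only
    void kernel_int() KERNEL_BODY
    #undef LEG

    #define LEG(d) if (!is_local(d)) process(d)  // EXTERIOR regime: halo legs only
    void kernel_ext() KERNEL_BODY
    #undef LEG

    int main() { kernel_full(); kernel_int(); kernel_ext(); }

The same trick is why each `template<> void ... #include` pair above looks body-less: the body is the include.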
+template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -162,6 +222,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -174,6 +243,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -189,6 +267,16 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -205,6 +293,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -217,6 +314,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef 
INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -229,6 +335,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN @@ -269,6 +384,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -281,6 +405,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -293,6 +426,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -309,6 +451,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -321,6 +472,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, 
FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -333,6 +493,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -354,6 +523,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -366,6 +544,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -380,6 +567,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -396,6 +592,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, 
FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -408,6 +613,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -420,6 +634,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 34aba472..db8651ab 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -39,24 +39,26 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR_AND_EXTERIOR -#define ZERO_NMU(A) -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); - -#define EXTERIOR_BLOCK(a,b,RECON) \ - LOAD_CHI(base); - -#define COMMON_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -67,62 +69,62 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR -#define COMMON_BLOCK(a,b,RECON) -#define ZERO_NMU(A) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ 
+ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ -// No accumulate for DIR0 -#define EXTERIOR_BLOCK_XP(a,b,RECON) \ - ZERO_PSI; \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) - -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif - //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR -#define ZERO_NMU(A) nmu=0; -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - nmu++; \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(a,base); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - LOAD64(%r10,isigns); \ - RECON; - -#define COMMON_BLOCK(a,b,RECON) - -#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif - { int nmu; int local,perm, ptype; @@ -134,11 +136,15 @@ MASK_REGS; int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; int sUn=lo.Reorder(ssn); -#ifndef EXTERIOR LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; #endif for(int s=0;s #define REGISTER #define LOAD_CHIMU \ - const SiteSpinor & ref (in._odata[offset]); \ + {const SiteSpinor & ref (in._odata[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ Chimu_02=ref()(0)(2);\ @@ -43,20 +43,20 @@ Author: paboyle Chimu_22=ref()(2)(2);\ Chimu_30=ref()(3)(0);\ Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2); + Chimu_32=ref()(3)(2);} #define LOAD_CHI\ - const SiteHalfSpinor &ref(buf[offset]); \ + {const SiteHalfSpinor &ref(buf[offset]); \ Chi_00 = ref()(0)(0);\ Chi_01 = ref()(0)(1);\ Chi_02 = ref()(0)(2);\ Chi_10 = ref()(1)(0);\ Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2); + Chi_12 = ref()(1)(2);} 
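A note on why the bodies of LOAD_CHIMU, LOAD_CHI and MULT_2SPIN gain braces in this hunk: each expansion declares a local named ref, and the HAND_STENCIL_LEG macros defined further down expand more than one of them inside a single scope (the exterior leg runs LOAD_CHI and MULT_2SPIN back to back), so every binding needs its own block. A minimal illustration of the failure mode, with hypothetical names:

    // sketch.cpp -- why statement macros that declare locals get braces
    static void use(const int &) {}

    #define LOAD_UNBRACED const int &ref = x; use(ref);
    #define LOAD_BRACED   { const int &ref = x; use(ref); }

    void f(int x) {
      // LOAD_UNBRACED LOAD_UNBRACED   // would not compile: 'ref' redeclared
      LOAD_BRACED LOAD_BRACED          // fine: each 'ref' is block scoped
    }

With the braces in place the hand-unrolled legs can be collapsed into one reusable macro per leg instead of eight copy-pasted blocks.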
// To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ - auto & ref(U._odata[sU](A)); \ + {auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ Impl::loadLinkElement(U_20,ref()(2,0)); \ @@ -83,7 +83,7 @@ Author: paboyle UChi_01+= U_10*Chi_02;\ UChi_11+= U_10*Chi_12;\ UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12; + UChi_12+= U_20*Chi_12;} #define PERMUTE_DIR(dir) \ @@ -307,55 +307,132 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -namespace Grid { -namespace QCD { +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out._odata[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out._odata[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + } -template void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; - REGISTER Simd result_00; // 12 regs on knc - REGISTER Simd result_01; - REGISTER Simd result_02; - - REGISTER Simd result_10; - REGISTER Simd result_11; - REGISTER Simd result_12; - - REGISTER Simd result_20; - REGISTER Simd result_21; - REGISTER 
Simd result_22; - - REGISTER Simd result_30; - REGISTER Simd result_31; - REGISTER Simd result_32; // 20 left - - REGISTER Simd Chi_00; // two spinor; 6 regs - REGISTER Simd Chi_01; - REGISTER Simd Chi_02; - - REGISTER Simd Chi_10; - REGISTER Simd Chi_11; - REGISTER Simd Chi_12; // 14 left - - REGISTER Simd UChi_00; // two spinor; 6 regs - REGISTER Simd UChi_01; - REGISTER Simd UChi_02; - - REGISTER Simd UChi_10; - REGISTER Simd UChi_11; - REGISTER Simd UChi_12; // 8 left - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. +#define ZERO_RESULT \ + result_00=zero; \ + result_01=zero; \ + result_02=zero; \ + result_10=zero; \ + result_11=zero; \ + result_12=zero; \ + result_20=zero; \ + result_21=zero; \ + result_22=zero; \ + result_30=zero; \ + result_31=zero; \ + result_32=zero; #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -370,475 +447,225 @@ WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge #define Chimu_31 UChi_11 #define Chimu_32 UChi_12 +namespace Grid { +namespace QCD { + +template void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); int offset,local,perm, ptype; StencilEntry *SE; - // Xp - SE=st.GetEntry(ptype,Xp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xp); - } - XM_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YM_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZM_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TM_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XP_RECON_ACCUM; - - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
- } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YP_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZP_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TP_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); } template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) + int ss,int sU,const FermionField &in, FermionField &out) { - // std::cout << "Hand op Dhop "<_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} - { - MULT_2SPIN(Xp); - } - XP_RECON; +template void +WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
- } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YP_RECON_ACCUM; + HAND_DECLARATIONS(ignore); + int offset,local,perm, ptype; + StencilEntry *SE; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZP_RECON_ACCUM; +template +void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TP_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XM_RECON_ACCUM; - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YM_RECON_ACCUM; + HAND_DECLARATIONS(ignore); - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; + StencilEntry *SE; + int offset,local,perm, ptype; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZM_RECON_ACCUM; +template void +WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; + HAND_DECLARATIONS(ignore); - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TM_RECON_ACCUM; + int offset,local,perm, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } +template +void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); } //////////////////////////////////////////////// // Specialise Gparity to simple implementation //////////////////////////////////////////////// -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - +#define HAND_SPECIALISE_EMPTY(IMPL) \ + template<> void \ + WilsonKernels::HandDhopSite(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + 
FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDag(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + HAND_SPECIALISE_EMPTY(GparityWilsonImplF); + HAND_SPECIALISE_EMPTY(GparityWilsonImplD); + HAND_SPECIALISE_EMPTY(GparityWilsonImplFH); + HAND_SPECIALISE_EMPTY(GparityWilsonImplDF); ////////////// Wilson ; uses this implementation ///////////////////// -// Need Nc=3 though // #define INSTANTIATE_THEM(A) \ template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \ -template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out);\ +template void WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); @@ -850,5 +677,15 @@ INSTANTIATE_THEM(DomainWallVec5dImplF); INSTANTIATE_THEM(DomainWallVec5dImplD); INSTANTIATE_THEM(ZDomainWallVec5dImplF); INSTANTIATE_THEM(ZDomainWallVec5dImplD); +INSTANTIATE_THEM(WilsonImplFH); +INSTANTIATE_THEM(WilsonImplDF); +INSTANTIATE_THEM(ZWilsonImplFH); +INSTANTIATE_THEM(ZWilsonImplDF); +INSTANTIATE_THEM(GparityWilsonImplFH); +INSTANTIATE_THEM(GparityWilsonImplDF); +INSTANTIATE_THEM(DomainWallVec5dImplFH); +INSTANTIATE_THEM(DomainWallVec5dImplDF); +INSTANTIATE_THEM(ZDomainWallVec5dImplFH); +INSTANTIATE_THEM(ZDomainWallVec5dImplDF); }} diff --git 
a/lib/qcd/spin/.dirstamp b/lib/qcd/spin/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/qcd/utils/.dirstamp b/lib/qcd/utils/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index ca789312..a864989c 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -54,7 +54,7 @@ THE SOFTWARE. #define GRID_MACRO_EMPTY() -#define GRID_MACRO_EVAL(...) GRID_MACRO_EVAL1024(__VA_ARGS__) +#define GRID_MACRO_EVAL(...) GRID_MACRO_EVAL64(__VA_ARGS__) #define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__)) #define GRID_MACRO_EVAL512(...) GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__)) #define GRID_MACRO_EVAL256(...) GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__)) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 2dbe26f4..02348686 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -377,8 +377,8 @@ namespace Optimization { b0 = _mm256_extractf128_si256(b,0); a1 = _mm256_extractf128_si256(a,1); b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_mul_epi32(a0,b0); - a1 = _mm_mul_epi32(a1,b1); + a0 = _mm_mullo_epi32(a0,b0); + a1 = _mm_mullo_epi32(a1,b1); return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) @@ -470,7 +470,52 @@ namespace Optimization { return in; }; }; - +#define USE_FP16 + struct PrecisionChange { + static inline __m256i StoH (__m256 a,__m256 b) { + __m256i h; +#ifdef USE_FP16 + __m128i ha = _mm256_cvtps_ph(a,0); + __m128i hb = _mm256_cvtps_ph(b,0); + h =(__m256i) _mm256_castps128_ps256((__m128)ha); + h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); +#else + assert(0); +#endif + return h; + } + static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { +#ifdef USE_FP16 + sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); + sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); +#else + assert(0); +#endif + } + static inline __m256 DtoS (__m256d a,__m256d b) { + __m128 sa = _mm256_cvtpd_ps(a); + __m128 sb = _mm256_cvtpd_ps(b); + __m256 s = _mm256_castps128_ps256(sa); + s = _mm256_insertf128_ps(s,sb,1); + return s; + } + static inline void StoD (__m256 s,__m256d &a,__m256d &b) { + a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); + b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); + } + static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { + __m256 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { + __m256 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; struct Exchange{ // 3210 ordering static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ @@ -675,6 +720,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef __m256i SIMD_Htype; // Single precision type typedef __m256 SIMD_Ftype; // Single precision type typedef __m256d SIMD_Dtype; // Double precision type typedef __m256i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index f39c4033..ba054665 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -235,11 +235,9 @@ namespace Optimization { inline void mac(__m512 &a, __m512 b, __m512 c){ a= _mm512_fmadd_ps( b, c, a); } - inline void mac(__m512d &a, __m512d b, __m512d c){ a= _mm512_fmadd_pd( b, c, a); } - // Real float inline __m512 operator()(__m512 a, __m512 b){ 
return _mm512_mul_ps(a,b); @@ -342,7 +340,52 @@ namespace Optimization { }; }; - +#define USE_FP16 + struct PrecisionChange { + static inline __m512i StoH (__m512 a,__m512 b) { + __m512i h; +#ifdef USE_FP16 + __m256i ha = _mm512_cvtps_ph(a,0); + __m256i hb = _mm512_cvtps_ph(b,0); + h =(__m512i) _mm512_castps256_ps512((__m256)ha); + h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1); +#else + assert(0); +#endif + return h; + } + static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { +#ifdef USE_FP16 + sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0)); + sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1)); +#else + assert(0); +#endif + } + static inline __m512 DtoS (__m512d a,__m512d b) { + __m256 sa = _mm512_cvtpd_ps(a); + __m256 sb = _mm512_cvtpd_ps(b); + __m512 s = _mm512_castps256_ps512(sa); + s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); + return s; + } + static inline void StoD (__m512 s,__m512d &a,__m512d &b) { + a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0)); + b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1)); + } + static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { + __m512 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { + __m512 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl // The operation is its own inverse @@ -539,7 +582,9 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m512 SIMD_Ftype; // Single precision type + + typedef __m512i SIMD_Htype; // Single precision type + typedef __m512 SIMD_Ftype; // Single precision type typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 7972da55..bdf5aa01 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -279,6 +279,101 @@ namespace Optimization { #undef timesi + struct PrecisionChange { + static inline vech StoH (const vecf &a,const vecf &b) { +#ifdef USE_FP16 + vech ret; + vech *ha = (vech *)&a; + vech *hb = (vech *)&b; + const int nf = W::r; + // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } + // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } + VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } + VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } +#else + assert(0); +#endif + return ret; + } + static inline void HtoS (vech h,vecf &sa,vecf &sb) { +#ifdef USE_FP16 + const int nf = W::r; + const int nh = W::r; + vech *ha = (vech *)&sa; + vech *hb = (vech *)&sb; + VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} + VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } + VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } +#else + assert(0); +#endif + } + static inline vecf DtoS (vecd a,vecd b) { + const int nd = W::r; + const int nf = W::r; + vecf ret; + VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } + VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + const int nd = W::r; + VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } + VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } + 
} + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + struct Exchange{ + + template + static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + }; + + ////////////////////////////////////////////// // Some Template specialization #define perm(a, b, n, w)\ @@ -403,6 +498,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef Optimization::vech SIMD_Htype; // Reduced precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 2142bc8e..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -66,6 +66,10 @@ namespace Optimization { template <> struct W { constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; // SIMD vector types template @@ -73,8 +77,9 @@ namespace Optimization { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; - typedef vec vecf; - typedef vec vecd; - typedef vec veci; + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; }} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index d77a560a..cbca9118 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -33,6 +33,14 @@ #include "Grid_generic_types.h" // Definitions for simulated integer SIMD. 
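The PrecisionChange structs this patch adds (SSE, AVX, AVX-512 and the generic path above) implement only two conversions natively, single-to-half (StoH/HtoS) and double-to-single (DtoS/StoD); DtoH and HtoD are just their composition, so double-to-half inherits whatever rounding the single-to-half step has. A self-contained scalar round-trip check of that composition; the converters below are simplified stand-ins handling normal numbers only (no Inf/NaN/denormals), not Grid's code:

    // sketch.cpp -- scalar round trip through the half-precision comms format
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static uint16_t toHalf(float f) {            // truncating StoH stand-in
      uint32_t u; std::memcpy(&u, &f, 4);
      uint32_t sign = (u >> 16) & 0x8000u;
      uint32_t exp  = ((u >> 23) & 0xffu) - 127u + 15u; // rebias 8- to 5-bit exponent
      uint32_t mant = (u >> 13) & 0x3ffu;               // keep top 10 mantissa bits
      return (uint16_t)(sign | (exp << 10) | mant);
    }
    static float fromHalf(uint16_t h) {          // HtoS stand-in
      uint32_t sign = ((uint32_t)h & 0x8000u) << 16;
      uint32_t exp  = (((h >> 10) & 0x1fu) + 127u - 15u) << 23;
      uint32_t mant = ((uint32_t)h & 0x3ffu) << 13;
      uint32_t u = sign | exp | mant;
      float f; std::memcpy(&f, &u, 4);
      return f;
    }

    int main() {
      for (double x : {1.0, -0.3333, 3.14159, 1e-3}) {
        // DtoH == StoH(DtoS(..)), HtoD == StoD(HtoS(..)): narrow, then widen back
        double y = (double)fromHalf(toHalf((float)x));
        assert(std::fabs(y - x) < std::fabs(x) * 1e-3); // ~2^-10: 10-bit mantissa
      }
      return 0;
    }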
namespace Grid { + +#ifdef QPX +#include +#include +#include +#include +#endif + namespace Optimization { typedef struct { @@ -125,7 +133,6 @@ namespace Optimization { f[2] = a.v2; f[3] = a.v3; } - //Double inline void operator()(double *d, vector4double a){ vec_st(a, 0, d); diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index fcad4c28..2fb2df76 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -328,6 +328,140 @@ namespace Optimization { }; }; + +#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) + +#ifdef SFW_FP16 + + struct Grid_half { + Grid_half(){} + Grid_half(uint16_t raw) : x(raw) {} + uint16_t x; + }; + union FP32 { + unsigned int u; + float f; + }; + + // PAB - Lifted and adapted from Eigen, which is GPL V2 + inline float sfw_half_to_float(Grid_half h) { + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; + } + inline Grid_half sfw_float_to_half(float ff) { + FP32 f; f.f = ff; + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + Grid_half o; + + o.x = static_cast(0x0u); + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! 
+ o.x = static_cast(f.u >> 13); + } + } + o.x |= static_cast(sign >> 16); + return o; + } + static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) { + __m128i ret=(__m128i)_mm_setzero_ps(); + float *fp = (float *)&f; + Grid_half *hp = (Grid_half *)&ret; + hp[0] = sfw_float_to_half(fp[0]); + hp[1] = sfw_float_to_half(fp[1]); + hp[2] = sfw_float_to_half(fp[2]); + hp[3] = sfw_float_to_half(fp[3]); + return ret; + } + static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { + __m128 ret=_mm_setzero_ps(); + float *fp = (float *)&ret; + Grid_half *hp = (Grid_half *)&h; + fp[0] = sfw_half_to_float(hp[0]); + fp[1] = sfw_half_to_float(hp[1]); + fp[2] = sfw_half_to_float(hp[2]); + fp[3] = sfw_half_to_float(hp[3]); + return ret; + } +#else +#define Grid_mm_cvtps_ph _mm_cvtps_ph +#define Grid_mm_cvtph_ps _mm_cvtph_ps +#endif + struct PrecisionChange { + static inline __m128i StoH (__m128 a,__m128 b) { + __m128i ha = Grid_mm_cvtps_ph(a,0); + __m128i hb = Grid_mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return h; + } + static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { + sa = Grid_mm_cvtph_ps(h,0); + h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = Grid_mm_cvtph_ps(h,0); + } + static inline __m128 DtoS (__m128d a,__m128d b) { + __m128 sa = _mm_cvtpd_ps(a); + __m128 sb = _mm_cvtpd_ps(b); + __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return s; + } + static inline void StoD (__m128 s,__m128d &a,__m128d &b) { + a = _mm_cvtps_pd(s); + s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); + b = _mm_cvtps_pd(s); + } + static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { + __m128 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { + __m128 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + struct Exchange{ // 3210 ordering static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ @@ -335,8 +469,10 @@ namespace Optimization { out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ }; static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ assert(0); @@ -383,14 +519,9 @@ namespace Optimization { default: assert(0); } } - -#ifndef _mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif - template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; - template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; + template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; }; ////////////////////////////////////////////// @@ -450,7 +581,8 @@ namespace 
Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m128 SIMD_Ftype; // Single precision type + typedef __m128i SIMD_Htype; // Half precision type + typedef __m128 SIMD_Ftype; // Single precision type typedef __m128d SIMD_Dtype; // Double precision type typedef __m128i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 57e7f11e..de4a13b5 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid -Source file: ./lib/simd/Grid_vector_types.h +Source file: ./lib/simd/Grid_vector_types.h Copyright (C) 2015 @@ -53,12 +53,14 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#if defined QPX -#include "Grid_qpx.h" -#endif #ifdef NEONv8 #include "Grid_neon.h" #endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" namespace Grid { @@ -74,12 +76,14 @@ struct RealPart > { typedef T type; }; +#include + ////////////////////////////////////// // demote a vector to real type ////////////////////////////////////// // type alias used to simplify the syntax of std::enable_if template using Invoke = typename T::type; -template using EnableIf = Invoke >; +template using EnableIf = Invoke >; template using NotEnableIf = Invoke >; //////////////////////////////////////////////////////// @@ -88,13 +92,15 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex > : public std::true_type {}; template <> struct is_complex > : public std::true_type {}; -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; //////////////////////////////////////////////////////// // Define the operation templates functors @@ -358,16 +364,12 @@ class Grid_simd { { if (n==3) { Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - // std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Real by "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Complex by "< =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Real by "< =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Complex by "< innerProduct(const Grid_simd &l, const Grid_simd &r) { return conjugate(l) * r; } - template inline Grid_simd outerProduct(const Grid_simd &l, const Grid_simd &r) { @@ -758,6 +755,67 @@ typedef Grid_simd, SIMD_Ftype> vComplexF; typedef Grid_simd, SIMD_Dtype> vComplexD; typedef Grid_simd vInteger; +// Half precision; no arithmetic support +typedef Grid_simd
vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2 +class SimpleCompressor { +public: + void Point(int) {}; + inline int CommDatumSize(void) { return sizeof(vobj); } + inline bool DecompressionStep(void) { return false; } + inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; } + inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, + int j,int k, int m,int type){ + exchange(out0[j],out1[j],in[k],in[m],type); + } + // For cshift. Cshift should drop compressor coupling altogether + // because I had to decouple the code from the Stencil anyway + inline vobj operator() (const vobj &arg) { + return arg; + } +}; + +} +#endif diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 479cd979..d1c28e78 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -28,42 +28,24 @@ #ifndef GRID_STENCIL_H #define GRID_STENCIL_H +#include // subdir aggregate #include // subdir aggregate -#define NEW_XYZT_GATHER + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need // additional stencil support. // - // Stencil based code will pre-exchange haloes and use a table lookup for neighbours. + // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. - // Overlap of comms and compute could be semi-automated by tabulating off-node connected, - // and - // - // Lattice could also allocate haloes which get used for stencil code. - // - // Grid could create a neighbour index table for a given stencil. - // - // Could also implement CovariantCshift, to fuse the loops and enhance performance. - // - // - // General stencil computation: - // + // Overlap of comms and compute is enabled by tabulating off-node connected, + // // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. - // // 2) Gather all faces, and communicate. // 3) Loop over result sites, giving nbr index/offnode info for each // - // Could take a - // SpinProjectFaces - // start comms - // complete comms - // Reconstruct Umu - // - // Approach. - // ////////////////////////////////////////////////////////////////////////////////////////// namespace Grid { @@ -82,7 +64,7 @@ void Gather_plane_simple_table (std::vector >& table,const La { int num=table.size(); parallel_for(int i=0;i >& table,const L int num=table.size()/2; int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane parallel_for(int j=0;j class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
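The precisionChange overloads declared above work on arrays of SIMD vectors, with nvec counting the double precision vectors; since each vRealF carries twice the lanes of a vRealD, the single precision array is half as long, which is why the assertion demands nvec even. A usage sketch (assumes Grid's headers and an SSE/AVX build; stack arrays keep the natural SIMD alignment):

  vRealD D[16], DD[16];        // 16 double precision SIMD vectors
  vRealF F[8];                 // the same data fits in 8 single precision vectors
  // ... fill D ...
  precisionChange(F,  D, 16);  // pack pairs of vRealD into vRealF, rounding the mantissa
  precisionChange(DD, F, 16);  // widen back; DD differs from D only by that rounding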
public: @@ -132,11 +110,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; - - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + /////////////////////////////////////////// + // Helper structs + /////////////////////////////////////////// struct Packet { void * send_buf; void * recv_buf; @@ -144,22 +121,134 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Integer from_rank; Integer bytes; }; - - std::vector Packets; - + struct Merge { + cobj * mpointer; + std::vector rpointers; + std::vector vpointers; + Integer buffer_size; + Integer type; + }; + struct Decompress { + cobj * kernel_p; + cobj * mpi_p; + Integer buffer_size; + }; + + //////////////////////////////////////// + // Basic Grid and stencil info + //////////////////////////////////////// int face_table_computed; std::vector > > face_table ; + - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - Packets.push_back(p); + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // npoints of these + std::vector _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + Vector _entries; + std::vector Packets; + std::vector Mergers; + std::vector MergersSHM; + std::vector Decompressions; + std::vector DecompressionsSHM; + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + // Vectors that live on the symmetric heap in case of SHMEM + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target + cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; + + int u_comm_offset; + int _unified_buffer_size; + cobj *CommBuf(void) { return u_recv_buf_p; } + + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// + double commtime; + double gathertime; + double gathermtime; + double halogtime; + double mergetime; + double decompresstime; + double comms_bytes; + double splicetime; + double nosplicetime; + double calls; + + //////////////////////////////////////// + // Stencil query + //////////////////////////////////////// + inline int SameNode(int point) { + + int dimension = _directions[point]; + int displacement = _distances[point]; + assert( (displacement==1) || (displacement==-1)); + + int pd = _grid->_processors[dimension]; + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + int recv_from_rank; + int xmit_to_rank; + + if ( ! 
comm_dim ) return 1; + + int nbr_proc; + if (displacement==1) nbr_proc = 1; + else nbr_proc = pd-1; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); + + if ( shm==NULL ) return 0; + + return 1; + } + inline int GetNodeLocal(int osite,int point) { + return _entries[point+_npoints*osite]._is_local; + } + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); @@ -172,128 +261,173 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Packets[i].from_rank, Packets[i].bytes); } - commtime+=usecond(); } + void CommunicateComplete(std::vector > &reqs) { - commtime-=usecond(); for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); } - _grid->StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); - /* - int dump=1; - if(dump){ - for(int i=0;i_ndimension;d++){ - ss<<"."<<_grid->_processor_coor[d]; - } - ss<<"_mu_"< void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + Prepare(); + HaloGather(source,compress); + CommunicateBegin(reqs); + CommunicateComplete(reqs); + CommsMergeSHM(compress); + CommsMerge(compress); + } + + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
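GetEntry/GetInfo above are the kernel-side contract: a local leg resolves relative to the caller's field storage, an off-node leg resolves into the unified receive buffer, and the permute flag tells the kernel it must still reorder SIMD lanes. A consumption sketch (field, osite and st are illustrative names, not part of the class):

  int ptype, local, perm;
  uint64_t base = (uint64_t)&field._odata[0];        // hypothetical lattice field
  for (int point = 0; point < st._npoints; point++) {
    int ent = point + st._npoints * osite;           // flat entry index, as in GetEntry
    uint64_t addr = st.GetInfo(ptype, local, perm, point, ent, base);
    const cobj *leg = (const cobj *)addr;            // valid once the halos are merged
    // if (perm) the SIMD lanes of *leg still need the permute of type ptype
  }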
+ int shift = (displacement+fd)%fd; + + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int same_node = 1; + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + same_node = same_node && GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + // if checkerboard is unfavourable take two passes + // both with block stride loop iteration + same_node = same_node && GatherSimd(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && GatherSimd(source,dimension,shift,0x2,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } } } - dump =0; -*/ + return same_node; } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - std::vector vpointers; - Integer buffer_size; - Integer packet_id; - Integer exchange; - Integer type; - }; - std::vector Mergers; + template + void HaloGather(const Lattice &source,compressor &compress) + { + _grid->StencilBarrier();// Synch shared memory on a single nodes - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.exchange = 0; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); } - - void AddMergeNew(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id,Integer type) { + + ///////////////////////// + // Implementation + ///////////////////////// + void Prepare(void) + { + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); + Packets.resize(0); + calls++; + } + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + Packets.push_back(p); + } + void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector &dv) { + Decompress d; + d.kernel_p = k_p; + d.mpi_p = m_p; + d.buffer_size = buffer_size; + dv.push_back(d); + } + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; - m.exchange = 1; m.type = type; m.mpointer = merge_p; m.vpointers= rpointers; 
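HaloExchange now decomposes into Prepare / HaloGather / CommunicateBegin / CommunicateComplete / CommsMergeSHM / CommsMerge, which is what lets a fermion operator overlap wire traffic with interior compute. One plausible caller-side schedule (a sketch; InteriorKernel and ExteriorKernel are placeholders for work split using SameNode, not Grid functions):

  std::vector<std::vector<CommsRequest_t> > reqs;
  st.Prepare();
  st.HaloGather(src, compress);
  st.CommunicateBegin(reqs);     // MPI now in flight
  st.CommsMergeSHM(compress);    // intra-node legs already visible through shared memory
  InteriorKernel(out, src);      // hypothetical: sites needing no off-node data
  st.CommunicateComplete(reqs);
  st.CommsMerge(compress);       // off-node legs become valid
  ExteriorKernel(out, src);      // hypothetical: the remaining surface sites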
m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); + mv.push_back(m); + } + template void CommsMerge(decompressor decompress) { + CommsMerge(decompress,Mergers,Decompressions); + } + template void CommsMergeSHM(decompressor decompress) { + _grid->StencilBarrier();// Synch shared memory on a single nodes + CommsMerge(decompress,MergersSHM,DecompressionsSHM); } - void CommsMerge(void ) { + template + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - for(int i=0;i_ndimension;d++){ - // ss<<"."<<_grid->_processor_coor[d]; - // } - // ss<<"_m_"< _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; - - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. - Vector _entries; - void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ if( _entries[i]._is_local ) { @@ -304,113 +438,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - /////////////////////////////////////////////////////////// - // Unified Comms buffers for all directions - /////////////////////////////////////////////////////////// - // Vectors that live on the symmetric heap in case of SHMEM - // std::vector > u_simd_send_buf_hide; - // std::vector > u_simd_recv_buf_hide; - // commVector u_send_buf_hide; - // commVector u_recv_buf_hide; - - // These are used; either SHM objects or refs to the above symmetric heap vectors - // depending on comms target - cobj* u_recv_buf_p; - cobj* u_send_buf_p; - std::vector new_simd_send_buf; - std::vector new_simd_recv_buf; - std::vector u_simd_send_buf; - std::vector u_simd_recv_buf; - - int u_comm_offset; - int _unified_buffer_size; - - cobj *CommBuf(void) { return u_recv_buf_p; } - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// -#define TIMING_HACK -#ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { -#define PRINTIT(A) \ - std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. 
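The merge and decompress queues drained by CommsMerge have a simple shape: each Merge record re-interleaves two half-vector streams with the compressor's Exchange, and each Decompress record widens an MPI buffer into the kernel's buffer. A sketch of the loop bodies, consistent with the SimpleCompressor interface earlier:

  for (int i = 0; i < mm.size(); i++) {
    parallel_for(int o = 0; o < mm[i].buffer_size / 2; o++) {
      decompress.Exchange(mm[i].mpointer, mm[i].vpointers[0], mm[i].vpointers[1], mm[i].type, o);
    }
  }
  for (int i = 0; i < dd.size(); i++) {
    parallel_for(int o = 0; o < dd[i].buffer_size; o++) {
      decompress.Decompress(dd[i].kernel_p, dd[i].mpi_p, o);
    }
  }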
) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); -#ifdef NEW_XYZT_GATHER + for(int l=0;l<2;l++){ - new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); - new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } -#else - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - } -#endif PrecomputeByteOffsets(); } @@ -611,7 +630,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal offnode = (comm_proc!= 0); } - int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -735,106 +753,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } } - template void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); - } - - template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. 
- int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - // GatherSimd(source,dimension,shift,0x3,compress,face_idx); - // std::cout << "GatherSimdNew"< - void HaloGather(const Lattice &source,compressor &compress) - { - _grid->StencilBarrier();// Synch shared memory on a single nodes - - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -858,6 +778,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + int shm_receive_only = 1; for(int x=0;x>1; - int bytes = words * sizeof(cobj); + int bytes = words * compress.CommDatumSize(); - gathertime-=usecond(); int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { - t_table-=usecond(); face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // for(int w=0;w : lane " << i <<" elem "<>(permute_type+1)); - int ic= (i&inner_bit)? 
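face_table memoises the plane-gather index arithmetic: the first traversal records (offset, index) pairs through Gather_plane_table_compute, and every later call just replays the table. The replay has the shape of Gather_plane_simple_table near the top of Stencil.h (a sketch; buffer, off and so as in that function):

  int num = table.size();
  parallel_for(int i = 0; i < num; i++) {
    compress.Compress(&buffer[off], table[i].first, rhs._odata[so + table[i].second]);
  }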
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - // shm == receive pointer if offnode - // shm == Translate[send pointer] if on node -- my view of his send pointer - scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { - shm = rp; - } - - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node - // assuming above pointer flip - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = shm; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - - template - void GatherSimdNew(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { const int Nsimd = _grid->Nsimd(); @@ -1063,8 +896,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int reduced_buffer_size = buffer_size; if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1; - int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout; - assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj)); + int datum_bytes = compress.CommDatumSize(); + int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; + assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); std::vector rpointers(maxl); std::vector spointers(maxl); @@ -1077,33 +911,26 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); // loop over outer coord planes orthog to dim + int shm_receive_only = 1; for(int x=0;x= rd ); if ( any_offnode ) { - for(int i=0;i > table; - t_table-=usecond(); if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<_Nprocessors; + RealD NN = _grid->NodeCount(); + + _grid->GlobalSum(commtime); commtime/=NP; + if ( calls > 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. 
<< " GB/s per rank"<::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; + typedef iScalar tensor_reduced; typedef iScalar scalar_object; - // substitutes a real or complex version with same tensor structure typedef iScalar::Complexified> Complexified; typedef iScalar::Realified> Realified; @@ -77,8 +77,12 @@ class iScalar { iScalar & operator= (const iScalar ©me) = default; iScalar & operator= (iScalar &©me) = default; */ - iScalar(scalar_type s) - : _internal(s){}; // recurse down and hit the constructor for vector_type + + // template + // iScalar(EnableIf, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type + + iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type + iScalar(const Zero &z) { *this = zero; }; iScalar &operator=(const Zero &hero) { @@ -134,42 +138,38 @@ class iScalar { strong_inline const vtype &operator()(void) const { return _internal; } // Type casts meta programmed, must be pure scalar to match TensorRemove - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexF() const { return (TensorRemove(_internal)); }; - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexD() const { return (TensorRemove(_internal)); }; // template = 0,IfNotSimd = // 0> operator RealD () const { return(real(TensorRemove(_internal))); } - template = 0, - IfNotSimd = 0> + template = 0,IfNotSimd = 0> operator RealD() const { return TensorRemove(_internal); } - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator Integer() const { return Integer(TensorRemove(_internal)); } // convert from a something to a scalar via constructor of something arg - template ::value, T>::type - * = nullptr> - strong_inline iScalar operator=(T arg) { + template ::value, T>::type * = nullptr> + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } - friend std::ostream &operator<<(std::ostream &stream, - const iScalar &o) { + friend std::ostream &operator<<(std::ostream &stream,const iScalar &o) { stream << "S {" << o._internal << "}"; return stream; }; + + }; /////////////////////////////////////////////////////////// // Allows to turn scalar>>> back to double. 
@@ -193,6 +193,7 @@ class iVector { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; typedef iScalar tensor_reduced; @@ -305,6 +306,7 @@ class iMatrix { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 22284be4..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -29,51 +29,109 @@ Author: Peter Boyle #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - typedef typename sobj::scalar_type scalar; - decltype(innerProduct(arg,arg)) nrm; - nrm = innerProduct(arg,arg); - RealD ret = real(nrm); - return ret; - } + /////////////////////////////////////////////////////////////////////////////////////// + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar + /////////////////////////////////////////////////////////////////////////////////////// + template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; + } + ////////////////////////////////////// + // If single promote to double and sum 2x + ////////////////////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline + auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; + return ret; + } + template inline + auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; + } + ////////////////////// + // 
Keep same precision + ////////////////////// + template inline + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; + } } #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 777b398d..af82de72 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -53,6 +53,7 @@ namespace Grid { public: typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; typedef typename T::tensor_reduced tensor_reduced; typedef typename T::scalar_object scalar_object; typedef typename T::Complexified Complexified; @@ -67,6 +68,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef RealF vector_type; + typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; typedef ComplexF Complexified; @@ -77,6 +79,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef RealD vector_type; + typedef RealD vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; typedef ComplexD Complexified; @@ -87,6 +90,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; + typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; typedef ComplexF Complexified; @@ -97,6 +101,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; + typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; typedef ComplexD Complexified; @@ -107,6 +112,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef Integer vector_type; + typedef Integer vector_typeD; typedef Integer tensor_reduced; typedef Integer scalar_object; typedef void Complexified; @@ -118,6 +124,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef vRealF vector_type; + typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; typedef vComplexF Complexified; @@ -128,16 +135,29 @@ namespace Grid { public: typedef RealD scalar_type; typedef vRealD vector_type; + typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; typedef vComplexD Complexified; typedef vRealD Realified; enum { TensorLevel = 0 }; }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexH vector_type; + typedef vComplexD vector_typeD; + typedef vComplexH tensor_reduced; + typedef ComplexF scalar_object; + typedef vComplexH Complexified; + typedef vRealH Realified; + enum { TensorLevel = 0 }; + }; template<> class GridTypeMapper { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; + typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; typedef vComplexF Complexified; @@ -148,6 +168,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; + typedef vComplexD vector_typeD; typedef 
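The motivation for vector_typeD and for routing norm2 through innerProductD is that long single precision reductions shed low-order bits; promoting the leaf product to double before the colour/spin accumulation recovers them at no memory-bandwidth cost. A self-contained scalar illustration of the effect (not Grid code):

  #include <cstdio>

  int main(void) {
    float  accF = 0.0f;
    double accD = 0.0;
    const float x = 1.0e-3f;
    for (int i = 0; i < (1 << 24); i++) {
      accF += x * x;          // single precision accumulator loses bits once the sum is large
      accD += (double)x * x;  // double accumulator keeps them
    }
    printf("float sum  = %.8e\ndouble sum = %.8e\n", accF, accD);
    return 0;
  }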
vComplexD tensor_reduced; typedef ComplexD scalar_object; typedef vComplexD Complexified; @@ -158,6 +179,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef vInteger vector_type; + typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; typedef void Complexified; @@ -241,7 +263,8 @@ namespace Grid { template class isSIMDvectorized{ template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); template static double test(...); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index dd3e6d13..d0685480 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -311,8 +311,8 @@ void Grid_init(int *argc,char ***argv) std::cout< void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} - template void apply(std::vector &r1,std::vector &r2,std::vector &in1,std::vector &in2) const { + template void apply(std::vector &r1, + std::vector &r2, + std::vector &in1, + std::vector &in2) const + { int sz=in1.size(); - - int msk = sz>>(n+1); - int j1=0; - int j2=0; - for(int i=0;i({45,12,81,9})); + const int Ndp = 16; + const int Nsp = Ndp/2; + const int Nhp = Ndp/4; + std::vector > H (Nhp); + std::vector > F (Nsp); + std::vector > FF(Nsp); + std::vector > D (Ndp); + std::vector > DD(Ndp); + for(int i=0;i<16;i++){ + random(sRNG,D[i]); + } + // Double to Single + precisionChange(&F[0],&D[0],Ndp); + precisionChange(&DD[0],&F[0],Ndp); + std::cout << GridLogMessage<<"Double to single"; + for(int i=0;i Cheby(alpha,beta,mu,order); @@ -131,10 +131,9 @@ int main (int argc, char ** argv) const int Nit= 10000; int Nconv; - RealD eresid = 1.0e-8; + RealD eresid = 1.0e-6; ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); @@ -145,9 +144,9 @@ int main (int argc, char ** argv) } { - // std::vector eval(Nm); - // std::vector evec(Nm,grid); - // ChebyIRL.calc(eval,evec,src, Nconv); + std::vector eval(Nm); + std::vector evec(Nm,grid); + ChebyIRL.calc(eval,evec,src, Nconv); } Grid_finalize(); diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index 30436e36..0ede352e 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; SchurDiagMooeeOperator HermOpEO(Ddwf); - ConjugateGradient CG(1.0e-8, 10000, 0);// switch off the assert + ConjugateGradient CG(1.0e-5, 10000, 0);// switch off the assert CGTimer.Start(); CG(HermOpEO, src_o, result_o); diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 131f6ce1..38f632ef 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); MdagMLinearOperator HermOp(Ddwf); - ConjugateGradient CG(1.0e-8,10000); + ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); Grid_finalize(); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc new file mode 100644 index 00000000..44e8fb52 --- /dev/null +++ 
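The Test_simd round trips above come with implicit tolerances worth recording: D -> F -> DD must agree with D to about 2^-24 relative (single precision mantissa), and the half leg via H to about 2^-11 (10+1 mantissa bits). A sketch of the comparison, reusing the test's vectors and the streamed operator<< for Grid_simd:

  precisionChange(&F[0],  &D[0], Ndp);  // double -> single: rounds
  precisionChange(&DD[0], &F[0], Ndp);  // single -> double: exact widening
  for (int i = 0; i < Ndp; i++) {
    std::cout << "D[" << i << "] = " << D[i] << "  round trip " << DD[i] << std::endl;
  }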
b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -0,0 +1,119 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_wilson_cg_unprec.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DR::ImplParams params; + + const int Ls=4; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + + FermionField src(FGrid); random(pRNG5,src); + FermionField result(FGrid); result=zero; + RealD nrm = norm2(src); + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + + RealD mass=0.01; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + MdagMLinearOperator HermOp(Ds); + + ConjugateGradient CG(1.0e-8,10000); + BlockConjugateGradient BCG(1.0e-8,10000); + MultiRHSConjugateGradient mCG(1.0e-8,10000); + + std::cout << GridLogMessage << "************************************************************************ "< HermOp4d(Ds4d); + FermionField src4d(UGrid); random(pRNG,src4d); + FermionField result4d(UGrid); result4d=zero; + CG(HermOp4d,src4d,result4d); + std::cout << GridLogMessage << "************************************************************************ "< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at 
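The new 5d staggered test drives three solvers on the same system so that iteration counts and wall time can be compared like for like. A sketch of the call pattern; the FermionField template parameter and the solver invocations are assumed by analogy with the 4d block at the end of the test:

  MdagMLinearOperator<ImprovedStaggeredFermion5DR, FermionField> HermOp(Ds);
  ConjugateGradient<FermionField>         CG (1.0e-8, 10000);  // one RHS at a time
  BlockConjugateGradient<FermionField>    BCG(1.0e-8, 10000);  // shares the search space across Ls
  MultiRHSConjugateGradient<FermionField> mCG(1.0e-8, 10000);  // independent RHS, fused kernels
  result = zero; CG (HermOp, src, result);
  result = zero; BCG(HermOp, src, result);
  result = zero; mCG(HermOp, src, result);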
your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + FermionField src(&Grid); random(pRNG,src); + RealD nrm = norm2(src); + FermionField result(&Grid); result=zero; + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOp(Ds); + CG(HermOp,src,result); + + Grid_finalize(); +}
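For reference, a hedged reconstruction of how the tail of Test_staggered_cg_unprec.cc plausibly reads between the volume loop and Grid_finalize; the mass value and the Ds constructor arguments are assumptions carried over from the companion 5d test above:

  double volume = 1;
  for (int mu = 0; mu < Nd; mu++) volume = volume * latt_size[mu];

  RealD mass = 0.01;                                      // assumed, as in the 5d test
  ImprovedStaggeredFermionR Ds(Umu, Umu, Grid, RBGrid, mass);

  MdagMLinearOperator<ImprovedStaggeredFermionR, FermionField> HermOp(Ds);
  ConjugateGradient<FermionField> CG(1.0e-8, 10000);
  CG(HermOp, src, result);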