diff --git a/.travis.yml b/.travis.yml index bc6dd0ef..a33d8ec0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ cache: matrix: include: - os: osx - osx_image: xcode7.2 + osx_image: xcode8.3 compiler: clang - compiler: gcc addons: @@ -73,8 +73,6 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: - export CC=$CC$VERSION @@ -92,15 +90,14 @@ script: - cd build - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - - make -j4 + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi diff --git a/configure.ac b/configure.ac index 3a323a5d..f18beb32 100644 --- a/configure.ac +++ b/configure.ac @@ -83,6 +83,19 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### FP16 conversions +AC_ARG_ENABLE([fp16], + [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], + [ac_FP16=${enable_fp16}], [ac_FP16=no]) +case ${ac_FP16} in + no) + ;; + yes) + AC_DEFINE([USE_FP16],[1],[conversion to fp16]);; + *) + ;; +esac + ############### MKL AC_ARG_ENABLE([mkl], [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], @@ -179,16 +192,16 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='-msse4.2';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) - SIMD_FLAGS='-mavx';; + SIMD_FLAGS='-mavx -mf16c';; AVXFMA4) AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma4';; + SIMD_FLAGS='-mavx -mfma4 -mf16c';; AVXFMA) AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) - SIMD_FLAGS='-mavx -mfma';; + SIMD_FLAGS='-mavx -mfma -mf16c';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) - SIMD_FLAGS='-mavx2 -mfma';; + SIMD_FLAGS='-mavx2 -mfma -mf16c';; AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h new file mode 100644 index 00000000..0f4a3a80 --- /dev/null +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -0,0 +1,516 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h + +Copyright (C) 2017 + 
+Author: Azusa Yamaguchi +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H +#define GRID_BLOCK_CONJUGATE_GRADIENT_H + +#include + +namespace Grid { + +GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); +} + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow + // If we based this on Cshift it would work for spread out + // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + for(int i=0;i +static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. 
+ typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss +static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); + std::vector sip(Nblock); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + Lattice tmp = rhs; + + for(int s1=0;s1 +class BlockConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion + + BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + + std::cout << GridLogMessage<<" Block solver has converged in " + < +class MultiRHSConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< v_pAp(Nblock); + std::vector v_rr (Nblock); + std::vector v_rr_inv(Nblock); + std::vector v_alpha(Nblock); + std::vector v_beta(Nblock); + + // Initial residual computation & set up + std::vector residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" MultiRHS solver has converged in " + < { } std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: k=0 residual " << cp << " target " << rsq - << std::endl; + << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 8eb75f15..68de52d0 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -359,7 +359,7 @@ void localConvert(const Lattice &in,Lattice &out) template -void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int orthog) +void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -401,7 +401,7 @@ void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int } template -void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, int orthog) +void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -444,7 +444,7 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in template -void InsertSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { typedef typename vobj::scalar_object sobj; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc 
b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 2ba4f4af..5ce2b335 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -160,8 +160,6 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - std::cout << " Umu " << Umu._odata[0]< Ah Bh, Al Bl // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl // The operation is its own inverse diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 7972da55..bdf5aa01 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -279,6 +279,101 @@ namespace Optimization { #undef timesi + struct PrecisionChange { + static inline vech StoH (const vecf &a,const vecf &b) { +#ifdef USE_FP16 + vech ret; + vech *ha = (vech *)&a; + vech *hb = (vech *)&b; + const int nf = W::r; + // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } + // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } + VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } + VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } +#else + assert(0); +#endif + return ret; + } + static inline void HtoS (vech h,vecf &sa,vecf &sb) { +#ifdef USE_FP16 + const int nf = W::r; + const int nh = W::r; + vech *ha = (vech *)&sa; + vech *hb = (vech *)&sb; + VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} + VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } + VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } +#else + assert(0); +#endif + } + static inline vecf DtoS (vecd a,vecd b) { + const int nd = W::r; + const int nf = W::r; + vecf ret; + VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } + VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + const int nd = W::r; + VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } + VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + struct Exchange{ + + template + static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + }; + + ////////////////////////////////////////////// // Some Template specialization #define perm(a, b, n, w)\ @@ -403,6 +498,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef Optimization::vech SIMD_Htype; // Reduced precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // 
Integer type diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 2142bc8e..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -66,6 +66,10 @@ namespace Optimization { template <> struct W { constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; // SIMD vector types template @@ -73,8 +77,9 @@ namespace Optimization { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; - typedef vec vecf; - typedef vec vecd; - typedef vec veci; + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; }} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index d77a560a..757c84c9 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -125,7 +125,6 @@ namespace Optimization { f[2] = a.v2; f[3] = a.v3; } - //Double inline void operator()(double *d, vector4double a){ vec_st(a, 0, d); diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index fcad4c28..8a4537c2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -38,6 +38,7 @@ Author: neo #include + namespace Grid { namespace Optimization { @@ -328,6 +329,56 @@ namespace Optimization { }; }; + +#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) + + struct PrecisionChange { + static inline __m128i StoH (__m128 a,__m128 b) { +#ifdef USE_FP16 + __m128i ha = _mm_cvtps_ph(a,0); + __m128i hb = _mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); +#else + __m128i h = (__m128i)a; + assert(0); +#endif + return h; + } + static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { +#ifdef USE_FP16 + sa = _mm_cvtph_ps(h); + h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = _mm_cvtph_ps(h); +#else + assert(0); +#endif + } + static inline __m128 DtoS (__m128d a,__m128d b) { + __m128 sa = _mm_cvtpd_ps(a); + __m128 sb = _mm_cvtpd_ps(b); + __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return s; + } + static inline void StoD (__m128 s,__m128d &a,__m128d &b) { + a = _mm_cvtps_pd(s); + s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); + b = _mm_cvtps_pd(s); + } + static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { + __m128 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { + __m128 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + struct Exchange{ // 3210 ordering static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ @@ -335,8 +386,10 @@ namespace Optimization { out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ }; static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ assert(0); @@ -383,14 +436,9 @@ namespace Optimization { default: assert(0); } } - -#ifndef 
_mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif - template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; - template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; + template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; }; ////////////////////////////////////////////// @@ -450,7 +498,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m128 SIMD_Ftype; // Single precision type + typedef __m128i SIMD_Htype; // Single precision type + typedef __m128 SIMD_Ftype; // Single precision type typedef __m128d SIMD_Dtype; // Double precision type typedef __m128i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 57e7f11e..ccedf99c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -358,16 +358,12 @@ class Grid_simd { { if (n==3) { Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - // std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<, SIMD_Ftype> vComplexF; typedef Grid_simd, SIMD_Dtype> vComplexD; typedef Grid_simd vInteger; +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2 void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} - template void apply(std::vector &r1,std::vector &r2,std::vector &in1,std::vector &in2) const { + template void apply(std::vector &r1, + std::vector &r2, + std::vector &in1, + std::vector &in2) const + { int sz=in1.size(); - - int msk = sz>>(n+1); - int j1=0; - int j2=0; - for(int i=0;i({45,12,81,9})); + const int Ndp = 16; + const int Nsp = Ndp/2; + const int Nhp = Ndp/4; + std::vector > H (Nhp); + std::vector > F (Nsp); + std::vector > FF(Nsp); + std::vector > D (Ndp); + std::vector > DD(Ndp); + for(int i=0;i<16;i++){ + random(sRNG,D[i]); + } + // Double to Single + precisionChange(&F[0],&D[0],Ndp); + precisionChange(&DD[0],&F[0],Ndp); + std::cout << GridLogMessage<<"Double to single"; + for(int i=0;i +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DR::ImplParams params; + + const int Ls=8; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + + FermionField src(FGrid); random(pRNG5,src); + FermionField result(FGrid); result=zero; + RealD nrm = norm2(src); + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + + RealD mass=0.01; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + + MdagMLinearOperator HermOp(Ds); + + ConjugateGradient CG(1.0e-8,10000); + BlockConjugateGradient BCG(1.0e-8,10000); + MultiRHSConjugateGradient mCG(1.0e-8,10000); + + std::cout << GridLogMessage << " Calling CG "< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + FermionField src(&Grid); random(pRNG,src); + RealD nrm = norm2(src); + FermionField result(&Grid); result=zero; + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOp(Ds); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOp,src,result); + + Grid_finalize(); +}
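---
For context (not part of the patch): a minimal usage sketch of the new precisionChange helpers that this commit introduces in lib/simd/Grid_vector_types.h, alongside the Test_simd.cc hunk above. It only exercises the double/single round trip, which needs no special configure flag; the half-precision paths (vRealH/vComplexH, PrecisionChange::StoH/HtoS) additionally require a build configured with --enable-fp16 on a target with F16C or the generic SIMD implementation. The main() scaffolding here is illustrative, not taken from the patch.

// precision_change_sketch.cc -- illustrative only, mirrors the calls made in Test_simd.cc
#include <Grid/Grid.h>
#include <vector>

using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  const int Ndp = 16;                      // number of double-precision SIMD vectors (must be even)
  std::vector<vRealD> D(Ndp), DD(Ndp);     // source doubles and round-tripped copy
  std::vector<vRealF> F(Ndp / 2);          // packed singles: two vRealD per vRealF

  // ... fill D, e.g. with random(sRNG, D[i]) as done in Test_simd.cc ...

  precisionChange(&F[0],  &D[0], Ndp);     // double -> single
  precisionChange(&DD[0], &F[0], Ndp);     // single -> double, DD can then be compared against D

  Grid_finalize();
  return 0;
}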