diff --git a/configure b/configure index 3c8dcd01..dbc34570 100755 --- a/configure +++ b/configure @@ -1384,9 +1384,9 @@ Optional Features: --disable-dependency-tracking speeds up one-time build --disable-openmp do not use OpenMP - --enable-simd=SSE4|AVX|AVX2|AVX512|MIC + --enable-simd=SSE4|AVX|AVX2|AVX512|IMCI Select instructions to be SSE4.0, AVX 1.0, AVX - 2.0+FMA, AVX 512, MIC + 2.0+FMA, AVX 512, IMCI --enable-precision=single|double Select default word size of Real --enable-comms=none|mpi Select communications @@ -6414,13 +6414,20 @@ $as_echo "#define AVX2 1" >>confdefs.h $as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;} fi ;; - AVX512|MIC) - echo Configuring for AVX512 and MIC + AVX512) + echo Configuring for AVX512 $as_echo "#define AVX512 1" >>confdefs.h supported="cross compilation" ;; + IMCI) + echo Configuring for IMCI + +$as_echo "#define IMCI 1" >>confdefs.h + + supported="cross compilation" + ;; NEONv8) echo Configuring for experimental ARMv8a support diff --git a/configure.ac b/configure.ac index b4ebd166..e81abd0f 100644 --- a/configure.ac +++ b/configure.ac @@ -65,8 +65,8 @@ AC_CHECK_FUNCS([gettimeofday]) #Please install or provide the correct path to your installation #Info at: http://www.mpfr.org/)]) -AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\ - [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\ +AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\ + [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\ [ac_SIMD=${enable_simd}],[ac_SIMD=AVX2]) supported=no @@ -99,9 +99,14 @@ case ${ac_SIMD} in AC_MSG_WARN([Your processor does not support AVX2 instructions]) fi ;; - AVX512|MIC) - echo Configuring for AVX512 and MIC - AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] ) + AVX512) + echo Configuring for AVX512 + AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] ) + supported="cross compilation" + ;; + IMCI) + echo Configuring for IMCI + AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] ) supported="cross compilation" ;; NEONv8) diff --git a/lib/Algorithms.h b/lib/Algorithms.h index 311d12db..a77e50bc 100644 --- a/lib/Algorithms.h +++ b/lib/Algorithms.h @@ -17,6 +17,9 @@ #include +// Lanczos support +#include +#include #include diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h index ea25f6a8..620b2520 100644 --- a/lib/AlignedAllocator.h +++ b/lib/AlignedAllocator.h @@ -72,7 +72,6 @@ operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return t template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } - }; // namespace Grid #endif diff --git a/lib/Config.h.in b/lib/Config.h.in index 1c1b065b..76fd9737 100644 --- a/lib/Config.h.in +++ b/lib/Config.h.in @@ -6,7 +6,7 @@ /* AVX2 Intrinsics */ #undef AVX2 -/* AVX512 Intrinsics for Knights Corner */ +/* AVX512 Intrinsics for Knights Landing */ #undef AVX512 /* EMPTY_SIMD only for DEBUGGING */ @@ -110,6 +110,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H +/* IMCI Intrinsics for Knights Corner */ +#undef IMCI + /* NEON ARMv8 Experimental support */ #undef NEONv8 diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h new file mode 100644 index 00000000..9d6153da --- /dev/null +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -0,0 +1,388 @@ +#ifndef GRID_IRL_H +#define GRID_IRL_H + +namespace Grid { + + ///////////////////////////////////////////////////////////// + // Base classes for iterative processes based on operators + // single input vec, single output vec. + ///////////////////////////////////////////////////////////// + + template + class ImplicitlyRestartedLanczos { +public: + + int Niter; + int Nk; + int Np; + RealD enorm; + RealD vthr; + + LinearOperatorBase &_Linop; + OperatorFunction &_poly; + + ImplicitlyRestartedLanczos( + LinearOperatorBase &Linop, + OperatorFunction & poly, + int _Nk, + int _Np, + RealD _enorm, + RealD _vthrs, + int _Niter) : + _Linop(Linop), + _poly(poly), + Nk(_Nk), + Np(_Np), + enorm(_enorm), + vthr(_vthrs) + { + vthr=_vthrs; + Niter=_Niter; + }; + + + void step(Vector& lmda, + Vector& lmdb, + Vector& evec, + Field& f,int Nm,int k) + { + + assert( k< Nm ); + + w = opr_->mult(evec[k]); + + if(k==0){ // Initial step + + RealD wnorm= w*w; + std::cout<<"wnorm="<& lmda, + Vector& lmdb, + int Nk, + int Nm, + Vector& Qt, + RealD Dsft, + int kmin, + int kmax) + { + int k = kmin-1; + RealD x; + + RealD Fden = 1.0/sqrt((lmd[k]-Dsh)*(lmd[k]-Dsh) +lme[k]*lme[k]); + RealD c = ( lmd[k] -Dsh) *Fden; + RealD s = -lme[k] *Fden; + + RealD tmpa1 = lmd[k]; + RealD tmpa2 = lmd[k+1]; + RealD tmpb = lme[k]; + + lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; + lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; + lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; + x = -s*lme[k+1]; + lme[k+1] = c*lme[k+1]; + + for(int i=0; i& lmda, + Vector& lmdb, + int Nm2, + int Nm, + Vector& Qt) + { + int Niter = 100*Nm; + int kmin = 1; + int kmax = Nk; + // (this should be more sophisticated) + + for(int iter=0; iter= kmin; --j){ + RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); + if(fabs(lme[j-1])+dds > dds){ + kmax = j+1; + goto continued; + } + } + Niter = iter; + return; + + continued: + for(int j=0; j dds){ + kmin = j+1; + break; + } + } + } + std::cout << "[QL method] Error - Too many iteration: "<& evec, + int k) + { + // Schmidt orthogonalization + + size_t size = w.size(); + assert(size%2 ==0); + + std::slice re(0,size/2,2); + std::slice im(1,size/2,2); + + for(int j=0; j evr(evec[j][re]); + valarray evi(evec[j][im]); + + w.add(re, -prdr*evr +prdi*evi); + w.add(im, -prdr*evi -prdi*evr); + } + } + + void calc(Vector& lmd, + Vector& evec, + const Field& b, + int& Nsbt, + int& Nconv) + { + const size_t fsize = evec[0].size(); + + Nconv = -1; + Nsbt = 0; + int Nm = Nk_+Np_; + + std::cout << " -- Nk = " << Nk_ << " Np = "<< Np_ << endl; + std::cout << " -- Nm = " << Nm << endl; + std::cout << " -- size of lmd = " << lmd.size() << endl; + std::cout << " -- size of evec = " << evec.size() << endl; + + assert(Nm < evec.size() && Nm < lmd.size()); + + vector lme(Nm); + vector lmd2(Nm); + vector lme2(Nm); + vector Qt(Nm*Nm); + vector Iconv(Nm); + + vector B(Nm); + for(int k=0; ksaturated(lmd2[i],vthr)) ++Kthrs; + std::cout<<"Kthrs="< 0){ + // (there is a converged eigenvalue larger than Vthrs.) + Nconv = iter; + goto converged; + } + } // end of iter loop + + std::cout<<"\n NOT converged.\n"; + abort(); + + converged: + // Sorting + + lmd.clear(); + evec.clear(); + for(int i=0; ipush(lmd,evec,Kdis); + Nsbt = Kdis - Kthrs; + + std::cout << "\n Converged\n Summary :\n"; + std::cout << " -- Iterations = "<< Nconv << "\n"; + std::cout << " -- beta(k) = "<< beta_k << "\n"; + std::cout << " -- Kdis = "<< Kdis << "\n"; + std::cout << " -- Nsbt = "<< Nsbt << "\n"; + } + }; +} +#endif diff --git a/lib/algorithms/iterative/MatrixUtils.h b/lib/algorithms/iterative/MatrixUtils.h new file mode 100644 index 00000000..712c9217 --- /dev/null +++ b/lib/algorithms/iterative/MatrixUtils.h @@ -0,0 +1,48 @@ +#ifndef GRID_MATRIX_UTILS_H +#define GRID_MATRIX_UTILS_H + +namespace Grid { + + namespace MatrixUtils { + + template inline void Size(Matrix& A,int &N,int &M){ + N=A.size(); assert(N>0); + M=A[0].size(); + for(int i=0;i inline void SizeSquare(Matrix& A,int &N) + { + int M; + Size(A,N,M); + assert(N==M); + } + + template inline void Fill(Matrix& A,T & val) + { + int N,M; + Size(A,N,M); + for(int i=0;i inline void Diagonal(Matrix& A,T & val) + { + int N; + SizeSquare(A,N); + for(int i=0;i inline void Identity(Matrix& A) + { + Fill(A,0.0); + Diagonal(A,1.0); + } + + }; +} +#endif diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h index 9c9ef044..a66a7af6 100644 --- a/lib/lattice/Lattice_base.h +++ b/lib/lattice/Lattice_base.h @@ -29,6 +29,9 @@ extern int GridCshiftPermuteMap[4][16]; class LatticeBase {}; class LatticeExpressionBase {}; +template using Vector = std::vector >; // Aligned allocator?? +template using Matrix = std::vector > >; // Aligned allocator?? + template class LatticeUnaryExpression : public std::pair > , public LatticeExpressionBase { public: @@ -59,7 +62,7 @@ public: GridBase *_grid; int checkerboard; - std::vector > _odata; + Vector _odata; // to pthread need a computable loop where loop induction is not required int begin(void) { return 0;}; diff --git a/lib/qcd/action/gauge/WilsonGaugeAction.h b/lib/qcd/action/gauge/WilsonGaugeAction.h index 16cef0c0..e98e15d6 100644 --- a/lib/qcd/action/gauge/WilsonGaugeAction.h +++ b/lib/qcd/action/gauge/WilsonGaugeAction.h @@ -42,7 +42,7 @@ namespace Grid{ // Staple in direction mu WilsonLoops::Staple(dSdU_mu,U,mu); dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor; - pokeLorentz(dSdU, dSdU_mu, mu); + PokeIndex(dSdU, dSdU_mu, mu); } }; }; diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h index 042e0e7a..b572cc46 100644 --- a/lib/serialisation/BaseIO.h +++ b/lib/serialisation/BaseIO.h @@ -10,15 +10,15 @@ public: virtual void push(const std::string &s) = 0; virtual void pop(void) =0; virtual void write( const std::string& s,const std::string &output ) =0; - virtual void write( const std::string& s, int16_t output ) =0; - virtual void write( const std::string& s, uint16_t output ) =0; - virtual void write( const std::string& s, int32_t output ) =0; - virtual void write( const std::string& s, uint32_t output ) =0; - virtual void write( const std::string& s, int64_t output ) =0; - virtual void write( const std::string& s, uint64_t output ) =0; - virtual void write( const std::string& s, float output ) =0; - virtual void write( const std::string& s, double output ) =0; - virtual void write( const std::string& s, bool output ) =0; + virtual void write( const std::string& s,const int16_t output ) =0; + virtual void write( const std::string& s,const uint16_t output ) =0; + virtual void write( const std::string& s,const int32_t output ) =0; + virtual void write( const std::string& s,const uint32_t output ) =0; + virtual void write( const std::string& s,const int64_t output ) =0; + virtual void write( const std::string& s,const uint64_t output ) =0; + virtual void write( const std::string& s,const float output ) =0; + virtual void write( const std::string& s,const double output ) =0; + virtual void write( const std::string& s,const bool output ) =0; }; diff --git a/lib/serialisation/BinaryIO.h b/lib/serialisation/BinaryIO.h index be5060c0..74edbe4e 100644 --- a/lib/serialisation/BinaryIO.h +++ b/lib/serialisation/BinaryIO.h @@ -35,19 +35,19 @@ public: write(s,cstr[c]); } }; - void write( const std::string& s, char output ) { writeInternal(s,output); }; - void write( const std::string& s, int16_t output ) { writeInternal(s,output); }; - void write( const std::string& s, uint16_t output ) { writeInternal(s,output); }; - void write( const std::string& s, int32_t output ) { writeInternal(s,output); }; - void write( const std::string& s, uint32_t output ) { writeInternal(s,output); }; - void write( const std::string& s, int64_t output ) { writeInternal(s,output); }; - void write( const std::string& s, uint64_t output ) { writeInternal(s,output); }; - void write( const std::string& s, float output ) { writeInternal(s,output); }; - void write( const std::string& s, double output ) { writeInternal(s,output); }; - void write( const std::string& s, bool output ) { writeInternal(s,output); }; + void write( const std::string& s,const char output ) { writeInternal(s,output); }; + void write( const std::string& s,const int16_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const int32_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const int64_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); }; + void write( const std::string& s,const float output ) { writeInternal(s,output); }; + void write( const std::string& s,const double output ) { writeInternal(s,output); }; + void write( const std::string& s,const bool output ) { writeInternal(s,output); }; private: - template void writeInternal( const std::string& s, T output ){ + template void writeInternal( const std::string& s,const T output ){ // FIXME --- htons, htonl, htno64 etc.. file.write((char *)&output,sizeof(T)); } diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index 8410b085..05743160 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -120,14 +120,14 @@ THE SOFTWARE. GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \ \ \ - template friend void write(Writer &WR,const std::string &s, const cname &obj){ \ + friend void write(Writer &WR,const std::string &s, const cname &obj){ \ push(WR,s);\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ pop(WR);\ } \ \ \ - template friend void read(Reader &RD,const std::string &s, cname &obj){ \ + friend void read(Reader &RD,const std::string &s, cname &obj){ \ push(RD,s);\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ pop(RD);\ diff --git a/lib/serialisation/Serialisation.h b/lib/serialisation/Serialisation.h index ff4c77fd..c1191504 100644 --- a/lib/serialisation/Serialisation.h +++ b/lib/serialisation/Serialisation.h @@ -3,6 +3,7 @@ #include #include +#include namespace Grid { @@ -12,17 +13,17 @@ namespace Grid { inline void push(Writer & WR,const char *s) { WR.push(std::string(s));} inline void pop (Writer & WR) { WR.pop();} - inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); }; + // inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); }; inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, int16_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, uint16_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, int32_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, uint32_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, int64_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, uint64_t output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, float output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, double output ) { wr.write(s,output); }; - inline void write(Writer& wr, const std::string& s, bool output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const int16_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const uint16_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const int32_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const uint32_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const int64_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const uint64_t output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const float output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const double output ) { wr.write(s,output); }; + inline void write(Writer& wr, const std::string& s,const bool output ) { wr.write(s,output); }; inline void push(Reader & WR,const std::string &s) { WR.push(s);} inline void push(Reader & WR,const char *s) { WR.push(std::string(s));} diff --git a/lib/serialisation/TextIO.h b/lib/serialisation/TextIO.h index 16f5a018..727d18dc 100644 --- a/lib/serialisation/TextIO.h +++ b/lib/serialisation/TextIO.h @@ -43,19 +43,19 @@ public: indent(); file< void writeInternal( const std::string& s, T output ){ + template void writeInternal( const std::string& s,const T output ){ indent(); file << std::boolalpha << output< void writeInternal( const std::string& s, T output ){ + template void writeInternal( const std::string& s,const T output ){ std::ostringstream os; os << std::boolalpha << output; write(s,os.str()); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index b874f15c..7c1800db 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -149,49 +149,33 @@ namespace Optimization { } }; + // Note, we can beat the shuf overhead in chain with two temporaries + // Ar Ai , Br Bi, Ai Ar // one shuf + //tmpr Ar Br, Ai Bi // Mul/Mac/Mac + //tmpi Br Ai, Bi Ar // Mul/Mac/Mac + // add tmpi,shuf(tmpi) + // sub tmpr,shuf(tmpi) + // shuf(tmpr,tmpi). // Could drop/trade for write mask + + // Gives + // 2mul,4 mac +add+sub = 8 flop type insns + // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. struct MultComplex{ // Complex float inline __m512 operator()(__m512 a, __m512 b){ - __m512 vzero,ymm0,ymm1,real, imag; - vzero = _mm512_setzero_ps(); - ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // - real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); - imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); - ymm1 = _mm512_mul_ps(real, b); - ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK - return _mm512_fmadd_ps(ymm0,imag,ymm1); + // dup, dup, perm, mul, madd + __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar + __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai + a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr } // Complex double inline __m512d operator()(__m512d a, __m512d b){ - /* This is from - * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets - * @inproceedings{McFarlin:2011:ASV:1995896.1995938, - * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, - * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, - * booktitle = {Proceedings of the International Conference on Supercomputing}, - * series = {ICS '11}, - * year = {2011}, - * isbn = {978-1-4503-0102-2}, - * location = {Tucson, Arizona, USA}, - * pages = {265--274}, - * numpages = {10}, - * url = {http://doi.acm.org/10.1145/1995896.1995938}, - * doi = {10.1145/1995896.1995938}, - * acmid = {1995938}, - * publisher = {ACM}, - * address = {New York, NY, USA}, - * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, - * } - */ - __m512d vzero,ymm0,ymm1,real,imag; - vzero =_mm512_setzero_pd(); - ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // - real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0); - imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0); - ymm1 = _mm512_mul_pd(real, b); - ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK - return _mm512_fmadd_pd(ymm0,imag,ymm1); + __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 ); + __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF ); + a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) ); + return _mm512_fmaddsub_pd( a_real, b, a_imag ); } }; @@ -227,12 +211,12 @@ namespace Optimization { //Complex single inline __m512 operator()(__m512 in, __m512 ret){ __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag - return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK + return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); } //Complex double inline __m512d operator()(__m512d in, __m512d ret){ __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag - return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK + return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); } @@ -241,13 +225,13 @@ namespace Optimization { struct TimesI{ //Complex single inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK - return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag + __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); } //Complex double inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK - return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag + __m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); } @@ -325,8 +309,8 @@ namespace Grid { } conv; conv.v = b.v; switch(perm){ - case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; - case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; + case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; + case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; default: assert(0); break; diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h new file mode 100644 index 00000000..c89e8830 --- /dev/null +++ b/lib/simd/Grid_imci.h @@ -0,0 +1,355 @@ +//---------------------------------------------------------------------- +/*! @file Grid_knc.h + @brief Optimization libraries for AVX512 instructions set for KNC + + Using intrinsics +*/ +// Time-stamp: <2015-06-09 14:27:28 neo> +//---------------------------------------------------------------------- + +#include + +#ifndef KNC_ONLY_STORES +#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512 +#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512 +#endif + + +namespace Optimization { + + struct Vsplat{ + //Complex float + inline __m512 operator()(float a, float b){ + return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); + } + // Real float + inline __m512 operator()(float a){ + return _mm512_set1_ps(a); + } + //Complex double + inline __m512d operator()(double a, double b){ + return _mm512_set_pd(b,a,b,a,b,a,b,a); + } + //Real double + inline __m512d operator()(double a){ + return _mm512_set1_pd(a); + } + //Integer + inline __m512i operator()(Integer a){ + return _mm512_set1_epi32(a); + } + }; + + struct Vstore{ + //Float + inline void operator()(__m512 a, float* F){ + _mm512_store_ps(F,a); + } + //Double + inline void operator()(__m512d a, double* D){ + _mm512_store_pd(D,a); + } + //Integer + inline void operator()(__m512i a, Integer* I){ + _mm512_store_si512((__m512i *)I,a); + } + + }; + + + struct Vstream{ + //Float + inline void operator()(float * a, __m512 b){ + _mm512_storenrngo_ps(a,b); + } + //Double + inline void operator()(double * a, __m512d b){ + _mm512_storenrngo_pd(a,b); + } + + + }; + + + + struct Vset{ + // Complex float + inline __m512 operator()(Grid::ComplexF *a){ + return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), + a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), + a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m512d operator()(Grid::ComplexD *a){ + return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m512 operator()(float *a){ + return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m512d operator()(double *a){ + return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Integer + inline __m512i operator()(Integer *a){ + return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + + + }; + + template + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + + + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_add_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_add_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_add_epi32(a,b); + } + }; + + struct Sub{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_sub_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_sub_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_sub_epi32(a,b); + } + }; + + + struct MultComplex{ + // Complex float + inline __m512 operator()(__m512 a, __m512 b){ + __m512 vzero,ymm0,ymm1,real, imag; + vzero = _mm512_setzero_ps(); + ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // + real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); + imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); + ymm1 = _mm512_mul_ps(real, b); + ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_ps(ymm0,imag,ymm1); + } + // Complex double + inline __m512d operator()(__m512d a, __m512d b){ + /* This is from + * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets + * @inproceedings{McFarlin:2011:ASV:1995896.1995938, + * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, + * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, + * booktitle = {Proceedings of the International Conference on Supercomputing}, + * series = {ICS '11}, + * year = {2011}, + * isbn = {978-1-4503-0102-2}, + * location = {Tucson, Arizona, USA}, + * pages = {265--274}, + * numpages = {10}, + * url = {http://doi.acm.org/10.1145/1995896.1995938}, + * doi = {10.1145/1995896.1995938}, + * acmid = {1995938}, + * publisher = {ACM}, + * address = {New York, NY, USA}, + * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, + * } + */ + __m512d vzero,ymm0,ymm1,real,imag; + vzero =_mm512_setzero_pd(); + ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // + real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0); + imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0); + ymm1 = _mm512_mul_pd(real, b); + ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_pd(ymm0,imag,ymm1); + } + }; + + struct Mult{ + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_mul_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_mul_pd(a,b); + } + // Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_mullo_epi32(a,b); + } + }; + + + struct Conj{ + // Complex single + inline __m512 operator()(__m512 in){ + return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag + } + // Complex double + inline __m512d operator()(__m512d in){ + return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); + } + // do not define for integer input + }; + + struct TimesMinusI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag + return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag + return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK + } + + + }; + + struct TimesI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag + } + + + }; + + + + + + ////////////////////////////////////////////// + // Some Template specialization + + //Complex float Reduce + template<> + inline Grid::ComplexF Reduce::operator()(__m512 in){ + return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in)); + } + //Real float Reduce + template<> + inline Grid::RealF Reduce::operator()(__m512 in){ + return _mm512_reduce_add_ps(in); + } + + + //Complex double Reduce + template<> + inline Grid::ComplexD Reduce::operator()(__m512d in){ + return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in)); + } + + //Real double Reduce + template<> + inline Grid::RealD Reduce::operator()(__m512d in){ + return _mm512_reduce_add_pd(in); + } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(__m512i in){ + // FIXME unimplemented + printf("Reduce : Missing integer implementation -> FIX\n"); + assert(0); + } + + +} + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types +namespace Grid { + typedef __m512 SIMD_Ftype; // Single precision type + typedef __m512d SIMD_Dtype; // Double precision type + typedef __m512i SIMD_Itype; // Integer type + + // prefecth + inline void v_prefetch0(int size, const char *ptr){ + for(int i=0;i + inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { + union { + __m512 f; + decltype(VectorSIMD::v) v; + } conv; + conv.v = b.v; + switch(perm){ + case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; + case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; + case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; + case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; + default: assert(0); break; + } + y.v=conv.v; + }; + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index fe3b6cba..16308f53 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -19,6 +19,9 @@ #if defined AVX512 #include "Grid_avx512.h" #endif +#if defined IMCI +#include "Grid_imci.h" +#endif #if defined QPX #include "Grid_qpx.h" #endif @@ -263,15 +266,13 @@ namespace Grid { // this is only for the complex version template =0, class ABtype> - inline void vsplat(Grid_simd &ret,ABtype a, ABtype b){ + inline void vsplat(Grid_simd &ret,ABtype a, ABtype b){ ret.v = binary(a, b, VsplatSIMD()); } // overload if complex template inline void vsplat(Grid_simd &ret, EnableIf, S> c) { - Real a = real(c); - Real b = imag(c); - vsplat(ret,a,b); + vsplat(ret,real(c),imag(c)); } //if real fill with a, if complex fill with a in the real part (first function above) @@ -290,8 +291,8 @@ namespace Grid { template = 0 > inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,S(0.0,1.0));} // if not complex overload here - template = 0 > inline void vone (Grid_simd &ret){ vsplat(ret,1.0); } - template = 0 > inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); } + template = 0 > inline void vone (Grid_simd &ret){ vsplat(ret,S(1.0)); } + template = 0 > inline void vzero(Grid_simd &ret){ vsplat(ret,S(0.0)); } // For integral types template = 0 > inline void vone(Grid_simd &ret) {vsplat(ret,1); } @@ -304,13 +305,18 @@ namespace Grid { /////////////////////// // Vstream /////////////////////// - template = 0 > - inline void vstream(Grid_simd &out,const Grid_simd &in){ - binary((Real*)&out.v, in.v, VstreamSIMD()); - } + template = 0 > + inline void vstream(Grid_simd &out,const Grid_simd &in){ + binary((S *)&out.v, in.v, VstreamSIMD()); + } + template = 0 > + inline void vstream(Grid_simd &out,const Grid_simd &in){ + typedef typename S::value_type T; + binary((T *)&out.v, in.v, VstreamSIMD()); + } template = 0 > - inline void vstream(Grid_simd &out,const Grid_simd &in){ + inline void vstream(Grid_simd &out,const Grid_simd &in){ out=in; } diff --git a/scripts/configure-commands b/scripts/configure-commands index d50da271..0dbd5438 100755 --- a/scripts/configure-commands +++ b/scripts/configure-commands @@ -44,7 +44,10 @@ icpc-avx512) CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none ;; icpc-mic) - CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none + CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none + ;; +icpc-mic-avx512) + CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none ;; clang-sse) CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none diff --git a/tests/Make.inc b/tests/Make.inc index 78f07962..7a84aaa4 100644 --- a/tests/Make.inc +++ b/tests/Make.inc @@ -1,5 +1,5 @@ -bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi +bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_partfrac_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_GaugeAction_SOURCES=Test_GaugeAction.cc @@ -85,6 +85,8 @@ Test_dwf_fpgcr_LDADD=-lGrid Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc Test_dwf_hdcr_LDADD=-lGrid +#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc +#Test_dwf_lanczos_LDADD=-lGrid Test_gamma_SOURCES=Test_gamma.cc Test_gamma_LDADD=-lGrid diff --git a/tests/Test_dwf_lanczos.cc b/tests/Test_dwf_lanczos.cc new file mode 100644 index 00000000..77ead91d --- /dev/null +++ b/tests/Test_dwf_lanczos.cc @@ -0,0 +1,57 @@ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=8; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermion src(FGrid); gaussian(RNG5,src); + LatticeGaugeField Umu(UGrid); + SU3::HotConfiguration(RNG4, Umu); + + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + MdagMLinearOperator HermOp(Ddwf); + + const int Nk = 10; + const int Np = 1; + RealD enorm = 1.0; + RealD vthrs = 1; + const int Nit= 1000; + + ImplicitlyRestartedLanczos IRL(HermOp,PolyX, + Nk,Np,enorm,vthrs,Nit); + + + std::vector eval(Nk); + std::vector evec(Nk,FGrid); + IRL.calc(eval,evec, + src, + Nsbt, + Nconv); + + + Grid_finalize(); +} diff --git a/tests/Test_serialisation.cc b/tests/Test_serialisation.cc index 3dbbf26c..9c669b40 100644 --- a/tests/Test_serialisation.cc +++ b/tests/Test_serialisation.cc @@ -1,6 +1,6 @@ #include -using namespace Grid; +namespace Grid { class myclass { public: @@ -24,29 +24,32 @@ public: }; +} -uint16_t i16 = 1; + int16_t i16 = 1; uint16_t u16 = 2; -uint32_t i32 = 3; + int32_t i32 = 3; uint32_t u32 = 4; -uint64_t i64 = 5; + int64_t i64 = 5; uint64_t u64 = 6; float f = M_PI; double d = 2*M_PI; bool b = false; +using namespace Grid; + int main(int argc,char **argv) { { XMLWriter WR("bother.xml"); push(WR,"BasicTypes"); - write(WR,"i16",i16); + write(WR,std::string("i16"),i16); write(WR,"u16",u16); write(WR,"i32",i32); - write(WR,"i32",u32); + write(WR,"u32",u32); write(WR,"i64",i64); - write(WR,"i64",u64); + write(WR,"u64",u64); write(WR,"f",f); write(WR,"d",d); write(WR,"b",b);