// // Grid.cpp // simd // // Created by Peter Boyle on 09/05/2014. // Copyright (c) 2014 University of Edinburgh. All rights reserved. // #ifndef GRID_V3_H #define GRID_V3_H #include #include #include #include #include #include #include #include #include #include #include //////////////////////////////////////////////////////////// // Tunable header includes //////////////////////////////////////////////////////////// #ifdef HAVE_OPENMP #define OMP #include #endif #ifdef HAVE_MALLOC_MALLOC_H #include #endif #ifdef HAVE_MALLOC_H #include #endif //////////////////////////////////////////////////////////// // SIMD Alignment controls //////////////////////////////////////////////////////////// #ifdef HAVE_VAR_ATTRIBUTE_ALIGNED #define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A))) #else #define ALIGN_DIRECTIVE(A) __declspec(align(A)) #endif #ifdef SSE2 #include #define SIMDalign ALIGN_DIRECTIVE(16) #endif #if defined(AVX1) || defined (AVX2) #include #define SIMDalign ALIGN_DIRECTIVE(32) #endif #ifdef AVX512 #include #define SIMDalign ALIGN_DIRECTIVE(64) #endif namespace dpo { void Grid_init(void); inline double usecond(void) { struct timeval tv; gettimeofday(&tv,NULL); return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; } typedef float RealF; typedef double RealD; typedef RealF Real; typedef std::complex ComplexF; typedef std::complex ComplexD; typedef std::complex Complex; class Zero{}; static Zero zero; template inline void ZeroIt(itype &arg){ arg=zero;}; template<> inline void ZeroIt(ComplexF &arg){ arg=0; }; template<> inline void ZeroIt(ComplexD &arg){ arg=0; }; template<> inline void ZeroIt(RealF &arg){ arg=0; }; template<> inline void ZeroIt(RealD &arg){ arg=0; }; // TODO // // Base class to share common code between vRealF, VComplexF etc... // // lattice Broad cast assignment // // where() support // implement with masks, and/or? Type of the mask & boolean support? // // Unary functions // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg // exp, log, sqrt, fabs // // transposeColor, transposeSpin, // adjColor, adjSpin, // traceColor, traceSpin. // peekColor, peekSpin + pokeColor PokeSpin // // copyMask. // // localMaxAbs // // norm2, // sumMulti equivalent. // Fourier transform equivalent. // //////////////////////////////////////////////////////////////////////////////// //Provide support functions for basic real and complex data types required by dpo //Single and double precision versions. Should be able to template this once only. //////////////////////////////////////////////////////////////////////////////// inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); }; inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} inline ComplexD adj(const ComplexD& r){ return(conj(r)); } // conj already supported for complex inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } inline Complex adj(const Complex& r ){ return(conj(r)); } //conj already supported for complex inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);} inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);} inline RealD adj(const RealD & r){ return r; } // No-op for real inline RealD conj(const RealD & r){ return r; } inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } inline RealF adj(const RealF & r){ return r; } inline RealF conj(const RealF & r){ return r; } //////////////////////////////////////////////////////////////////////// // Vector types are arch dependent///////////////////////////////////// //////////////////////////////////////////////////////////////////////// #if defined (SSE2) typedef __m128 fvec; typedef __m128d dvec; typedef __m128 cvec; typedef __m128d zvec; #endif #if defined (AVX1) || defined (AVX2) typedef __m256 fvec; typedef __m256d dvec; typedef __m256 cvec; typedef __m256d zvec; #endif #if defined (AVX512) typedef __m512 fvec; typedef __m512d dvec; typedef __m512 cvec; typedef __m512d zvec; #endif #if defined (QPX) typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision typedef float cvec __attribute__ ((vector_size (16))); typedef vector4double dvec; typedef vector4double zvec; #endif #if defined (AVX1) || defined (AVX2) || defined (AVX512) inline void v_prefetch0(int size, const char *ptr){ for(int i=0;i class iScalar { public: SIMDalign vtype _internal; iScalar(){}; iScalar(Zero &z){ *this = zero; }; iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; } friend void zeroit(iScalar &that){ zeroit(that._internal); } // Unary negation friend inline iScalar operator -(const iScalar &r) { iScalar ret; ret._internal= -r._internal; return ret; } // *=,+=,-= operators inline iScalar &operator *=(const iScalar &r) { *this = (*this)*r; return *this; } inline iScalar &operator -=(const iScalar &r) { *this = (*this)-r; return *this; } inline iScalar &operator +=(const iScalar &r) { *this = (*this)+r; return *this; } }; template class iVector { public: SIMDalign vtype _internal[N]; iVector(Zero &z){ *this = zero; }; iVector() {}; iVector & operator= (Zero &hero){ zeroit(*this); return *this; } friend void zeroit(iVector &that){ for(int i=0;i operator -(const iVector &r) { iVector ret; for(int i=0;i &operator *=(const iScalar &r) { *this = (*this)*r; return *this; } inline iVector &operator -=(const iVector &r) { *this = (*this)-r; return *this; } inline iVector &operator +=(const iVector &r) { *this = (*this)+r; return *this; } }; template class iMatrix { public: SIMDalign vtype _internal[N][N]; iMatrix(Zero &z){ *this = zero; }; iMatrix() {}; iMatrix & operator= (Zero &hero){ zeroit(*this); return *this; } friend void zeroit(iMatrix &that){ for(int i=0;i operator -(const iMatrix &r) { iMatrix ret; for(int i=0;i inline iMatrix &operator *=(const T &r) { *this = (*this)*r; return *this; } template inline iMatrix &operator -=(const T &r) { *this = (*this)-r; return *this; } template inline iMatrix &operator +=(const T &r) { *this = (*this)+r; return *this; } }; /* inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; } inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } */ inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } inline RealD localInnerProduct(const RealD & l, const RealD & r) { return conj(l)*r; } inline RealF localInnerProduct(const RealF & l, const RealF & r) { return conj(l)*r; } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// ADD /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// // ADD is simple for now; cannot mix types and straightforward template // Scalar +/- Scalar // Vector +/- Vector // Matrix +/- Matrix template inline void add(iScalar * __restrict__ ret, const iScalar * __restrict__ lhs, const iScalar * __restrict__ rhs) { add(&ret->_internal,&lhs->_internal,&rhs->_internal); } template inline void add(iVector * __restrict__ ret, const iVector * __restrict__ lhs, const iVector * __restrict__ rhs) { for(int c=0;c_internal[c]=lhs->_internal[c]+rhs->_internal[c]; } return; } template inline void add(iMatrix * __restrict__ ret, const iMatrix * __restrict__ lhs, const iMatrix * __restrict__ rhs) { for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); }} return; } template inline void add(iMatrix * __restrict__ ret, const iScalar * __restrict__ lhs, const iMatrix * __restrict__ rhs) { for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); }} return; } template inline void add(iMatrix * __restrict__ ret, const iMatrix * __restrict__ lhs, const iScalar * __restrict__ rhs) { for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); else ret->_internal[c1][c2]=lhs->_internal[c1][c2]; }} return; } // Need to figure multi-precision. template Mytype timesI(Mytype &r) { iScalar i; i._internal = Complex(0,1); return r*i; } // + operator for scalar, vector, matrix template //inline auto operator + (iScalar& lhs,iScalar&& rhs) -> iScalar inline auto operator + (const iScalar& lhs,const iScalar& rhs) -> iScalar { typedef iScalar ret_t; ret_t ret; add(&ret,&lhs,&rhs); return ret; } template inline auto operator + (const iVector& lhs,const iVector& rhs) ->iVector { typedef iVector ret_t; ret_t ret; add(&ret,&lhs,&rhs); return ret; } template inline auto operator + (const iMatrix& lhs,const iMatrix& rhs) ->iMatrix { typedef iMatrix ret_t; ret_t ret; add(&ret,&lhs,&rhs); return ret; } template inline auto operator + (const iScalar& lhs,const iMatrix& rhs)->iMatrix { typedef iMatrix ret_t; ret_t ret; add(&ret,&lhs,&rhs); return ret; } template inline auto operator + (const iMatrix& lhs,const iScalar& rhs)->iMatrix { typedef iMatrix ret_t; ret_t ret; add(&ret,&lhs,&rhs); return ret; } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// SUB /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// // SUB is simple for now; cannot mix types and straightforward template // Scalar +/- Scalar // Vector +/- Vector // Matrix +/- Matrix // Matrix /- scalar template inline void sub(iScalar * __restrict__ ret, const iScalar * __restrict__ lhs, const iScalar * __restrict__ rhs) { sub(&ret->_internal,&lhs->_internal,&rhs->_internal); } template inline void sub(iVector * __restrict__ ret, const iVector * __restrict__ lhs, const iVector * __restrict__ rhs) { for(int c=0;c_internal[c]=lhs->_internal[c]-rhs->_internal[c]; } return; } template inline void sub(iMatrix * __restrict__ ret, const iMatrix * __restrict__ lhs, const iMatrix * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); }} return; } template inline void sub(iMatrix * __restrict__ ret, const iScalar * __restrict__ lhs, const iMatrix * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); } else { // Fails -- need unary minus. Catalogue other unops? ret->_internal[c1][c2]=zero; ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2]; } }} return; } template inline void sub(iMatrix * __restrict__ ret, const iMatrix * __restrict__ lhs, const iScalar * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); else ret->_internal[c1][c2]=lhs->_internal[c1][c2]; }} return; } template void vprefetch(const iScalar &vv) { vprefetch(vv._internal); } template void vprefetch(const iVector &vv) { for(int i=0;i void vprefetch(const iMatrix &vv) { for(int i=0;i inline auto operator - (const iScalar& lhs, const iScalar& rhs) -> iScalar { typedef iScalar ret_t; ret_t ret; sub(&ret,&lhs,&rhs); return ret; } template inline auto operator - (const iVector& lhs,const iVector& rhs) ->iVector { typedef iVector ret_t; ret_t ret; sub(&ret,&lhs,&rhs); return ret; } template inline auto operator - (const iMatrix& lhs,const iMatrix& rhs) ->iMatrix { typedef iMatrix ret_t; ret_t ret; sub(&ret,&lhs,&rhs); return ret; } template inline auto operator - (const iScalar& lhs,const iMatrix& rhs)->iMatrix { typedef iMatrix ret_t; ret_t ret; sub(&ret,&lhs,&rhs); return ret; } template inline auto operator - (const iMatrix& lhs,const iScalar& rhs)->iMatrix { typedef iMatrix ret_t; ret_t ret; sub(&ret,&lhs,&rhs); return ret; } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// MAC /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////// // Legal multiplication table /////////////////////////// // scal x scal = scal // mat x mat = mat // mat x scal = mat // scal x mat = mat // mat x vec = vec // vec x scal = vec // scal x vec = vec /////////////////////////// template inline void mac(iScalar * __restrict__ ret,const iScalar * __restrict__ lhs,const iScalar * __restrict__ rhs) { mac(&ret->_internal,&lhs->_internal,&rhs->_internal); } template inline void mac(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iMatrix * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); }}} return; } template inline void mac(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iScalar * __restrict__ rhs){ for(int c1=0;c1_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); }} return; } template inline void mac(iMatrix * __restrict__ ret,const iScalar * __restrict__ lhs,const iMatrix * __restrict__ rhs){ for(int c1=0;c1_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); }} return; } template inline void mac(iVector * __restrict__ ret,const iMatrix * __restrict__ lhs,const iVector * __restrict__ rhs) { for(int c1=0;c1_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); }} return; } template inline void mac(iVector * __restrict__ ret,const iScalar * __restrict__ lhs,const iVector * __restrict__ rhs) { for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); } return; } template inline void mac(iVector * __restrict__ ret,const iVector * __restrict__ lhs,const iScalar * __restrict__ rhs) { for(int c1=0;c1_internal[c1],&lhs->_internal[c1],&rhs->_internal); } return; } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// MUL /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// template inline void mult(iScalar * __restrict__ ret,const iScalar * __restrict__ lhs,const iScalar * __restrict__ rhs){ mult(&ret->_internal,&lhs->_internal,&rhs->_internal); } template inline void mult(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iMatrix * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]); for(int c3=1;c3_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); } }} return; } template inline void mult(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iScalar * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); }} return; } template inline void mult(iMatrix * __restrict__ ret,const iScalar * __restrict__ lhs,const iMatrix * __restrict__ rhs){ for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); }} return; } // Matrix left multiplies vector template inline void mult(iVector * __restrict__ ret,const iMatrix * __restrict__ lhs,const iVector * __restrict__ rhs) { for(int c1=0;c1_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]); for(int c2=1;c2_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); } } return; } template inline void mult(iVector * __restrict__ ret, const iScalar * __restrict__ lhs, const iVector * __restrict__ rhs){ for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); } } template inline void mult(iVector * __restrict__ ret, const iVector * __restrict__ rhs, const iScalar * __restrict__ lhs){ mult(ret,lhs,rhs); } template inline iVector operator * (const iMatrix& lhs,const iVector& rhs) { iVector ret; mult(&ret,&lhs,&rhs); return ret; } template inline iVector operator * (const iScalar& lhs,const iVector& rhs) { iVector ret; mult(&ret,&lhs,&rhs); return ret; } template inline iVector operator * (const iVector& lhs,const iScalar& rhs) { iVector ret; mult(&ret,&lhs,&rhs); return ret; } ////////////////////////////////////////////////////////////////// // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal) // since nesting matrix x matrix-> matrix // while matrix x matrix-> matrix // so return type depends on argument types in nasty way. ////////////////////////////////////////////////////////////////// // scal x scal = scal // mat x mat = mat // mat x scal = mat // scal x mat = mat // mat x vec = vec // vec x scal = vec // scal x vec = vec template inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { typedef iScalar ret_t; ret_t ret; mult(&ret,&lhs,&rhs); return ret; } template inline auto operator * (const iMatrix& lhs,const iMatrix& rhs) -> iMatrix { typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t; iMatrix ret; mult(&ret,&lhs,&rhs); return ret; } template inline auto operator * (const iMatrix& lhs,const iScalar& rhs) -> iMatrix { typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t; iMatrix ret; for(int c1=0;c1 inline auto operator * (const iScalar& lhs,const iMatrix& rhs) -> iMatrix { typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t; iMatrix ret; for(int c1=0;c1 inline auto operator * (const iMatrix& lhs,const iVector& rhs) -> iVector { typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t; iVector ret; for(int c1=0;c1 inline auto operator * (const iScalar& lhs,const iVector& rhs) -> iVector { typedef decltype(lhs._internal*rhs._internal[0]) ret_t; iVector ret; for(int c1=0;c1 inline auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector { typedef decltype(lhs._internal[0]*rhs._internal) ret_t; iVector ret; for(int c1=0;c1 Scalar // localInnerProduct Vector x Vector -> Scalar // localInnerProduct Matrix x Matrix -> Scalar /////////////////////////////////////////////////////////////////////////////////////// template inline auto localInnerProduct (const iVector& lhs,const iVector& rhs) -> iScalar { typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t; iScalar ret=zero; for(int c1=0;c1 inline auto localInnerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar { typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; iScalar ret=zero; for(int c1=0;c1 inline auto localInnerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar { typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t; iScalar ret; ret._internal = localInnerProduct(lhs._internal,rhs._internal); return ret; } /////////////////////////////////////////////////////////////////////////////////////// // outerProduct Scalar x Scalar -> Scalar // Vector x Vector -> Matrix /////////////////////////////////////////////////////////////////////////////////////// template inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix { typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t; iMatrix ret; for(int c1=0;c1 inline auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar { typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t; iScalar ret; ret._internal = outerProduct(lhs._internal,rhs._internal); return ret; } /* inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r) { return l*r; } inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r) { return l*r; } inline vRealF outerProduct(const vRealF &l, const vRealF& r) { return l*r; } inline vRealD outerProduct(const vRealD &l, const vRealD& r) { return l*r; } */ inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r) { return l*r; } inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r) { return l*r; } inline RealF outerProduct(const RealF &l, const RealF& r) { return l*r; } inline RealD outerProduct(const RealD &l, const RealD& r) { return l*r; } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// CONJ /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// // Conj function for scalar, vector, matrix template inline iScalar conj(const iScalar&r) { iScalar ret; ret._internal = conj(r._internal); return ret; } // Adj function for scalar, vector, matrix template inline iScalar adj(const iScalar&r) { iScalar ret; ret._internal = adj(r._internal); return ret; } template inline iVector adj(const iVector&r) { iVector ret; for(int i=0;i inline iMatrix adj(const iMatrix &arg) { iMatrix ret; for(int c1=0;c1 inline auto real(const iScalar &z) -> iScalar { iScalar ret; ret._internal = real(z._internal); return ret; } template inline auto real(const iMatrix &z) -> iMatrix { iMatrix ret; for(int c1=0;c1 inline auto real(const iVector &z) -> iVector { iVector ret; for(int c1=0;c1 inline auto imag(const iScalar &z) -> iScalar { iScalar ret; ret._internal = imag(z._internal); return ret; } template inline auto imag(const iMatrix &z) -> iMatrix { iMatrix ret; for(int c1=0;c1 inline auto imag(const iVector &z) -> iVector { iVector ret; for(int c1=0;c1 inline auto trace(const iMatrix &arg) -> iScalar { iScalar ret; ZeroIt(ret._internal); for(int i=0;i inline auto trace(const iScalar &arg) -> iScalar { iScalar ret; ret._internal=trace(arg._internal); return ret; } ///////////////////////////////////////////////////////////////////////// // Generic routine to promote object -> object // Supports the array reordering transformation that gives me SIMD utilisation ///////////////////////////////////////////////////////////////////////// /* template class object> inline object splat(objects){ object ret; vComplex * v_ptr = (vComplex *)& ret; Complex * s_ptr = (Complex *) &s; for(int i=0;i friend class Lattice; //protected: // Lattice wide random support. not yet fully implemented. Need seed strategy // and one generator per site. //std::default_random_engine generator; // static std::mt19937 generator( 9 ); // Grid information. unsigned long _ndimension; std::vector _layout; // Which dimensions get relayed out over simd lanes. std::vector _dimensions; // Dimensions of array std::vector _rdimensions;// Reduced dimensions with simd lane images removed std::vector _ostride; // Outer stride for each dimension std::vector _istride; // Inner stride i.e. within simd lane int _osites; // _isites*_osites = product(dimensions). int _isites; // subslice information std::vector _slice_block; std::vector _slice_stride; std::vector _slice_nblock; public: // These routines are key. Subdivide the linearised cartesian index into // "inner" index identifying which simd lane of object is associated with coord // "outer" index identifying which element of _odata in class "Lattice" is associated with coord. // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all // lanes are operated upon simultaneously. inline int oIndexReduced(std::vector &rcoor) { int idx=0; for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d]; return idx; } virtual int oIndex(std::vector &coor) { int idx=0; for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); return idx; } inline int iIndex(std::vector &rcoor) { int idx=0; for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]); return idx; } inline int oSites(void) { return _osites; }; inline int iSites(void) { return _isites; }; virtual int CheckerBoard(std::vector site)=0; virtual int CheckerBoardDestination(int source_cb,int shift)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0; }; //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// template class myallocator { public: typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; typedef _Tp* pointer; typedef const _Tp* const_pointer; typedef _Tp& reference; typedef const _Tp& const_reference; typedef _Tp value_type; template struct rebind { typedef myallocator<_Tp1> other; }; myallocator() throw() { } myallocator(const myallocator&) throw() { } template myallocator(const myallocator<_Tp1>&) throw() { } ~myallocator() throw() { } pointer address(reference __x) const { return &__x; } const_pointer address(const_reference __x) const { return &__x; } size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } // Should override allocate and deallocate pointer allocate(size_type __n, const void* = 0) { //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp)); // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); #ifdef AVX512 _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); #else _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); #endif return ptr; } void deallocate(pointer __p, size_type) { free(__p); } void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; template inline bool operator==(const myallocator<_Tp>&, const myallocator<_Tp>&){ return true; } template inline bool operator!=(const myallocator<_Tp>&, const myallocator<_Tp>&){ return false; } }; // namespace dpo #endif