From 1a1474b32339d9873fb46772032256ffe3330b72 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 4 Mar 2015 05:31:44 +0000
Subject: [PATCH] Better organisation

---
 Grid.h                   | 1122 +--------------------------------------------
 Grid_Lattice.h           |   98 +---
 Grid_QCD.h               |   94 ++++
 Grid_aligned_allocator.h |   56 ++
 Grid_config.h            |  101 ++++
 Grid_math_types.h        |  819 ++++++++++++++++++++++++++++
 Grid_signal.cc           |   16 +-
 Grid_simd.h              |   68 ++-
 8 files changed, 1161 insertions(+), 1213 deletions(-)
 create mode 100644 Grid_QCD.h
 create mode 100644 Grid_aligned_allocator.h
 create mode 100644 Grid_config.h
 create mode 100644 Grid_math_types.h

diff --git a/Grid.h b/Grid.h
index b3669ace..042835ca 100644
--- a/Grid.h
+++ b/Grid.h
@@ -40,1123 +40,21 @@
 #endif
 
-////////////////////////////////////////////////////////////
-// SIMD Alignment controls
-////////////////////////////////////////////////////////////
-#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
-#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
-#else
-#define ALIGN_DIRECTIVE(A) __declspec(align(A))
-#endif
-
-#ifdef SSE2
-#include
-#define SIMDalign ALIGN_DIRECTIVE(16)
-#endif
-
-#if defined(AVX1) || defined (AVX2)
-#include
-#define SIMDalign ALIGN_DIRECTIVE(32)
-#endif
-
-#ifdef AVX512
-#include
-#define SIMDalign ALIGN_DIRECTIVE(64)
-#endif
-
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 
 namespace dpo {
 
 void Grid_init(void);
-
-inline double usecond(void)
-{
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}
-
- typedef float  RealF;
- typedef double RealD;
- typedef RealF  Real;
-
- typedef std::complex<RealF> ComplexF;
- typedef std::complex<RealD> ComplexD;
- typedef std::complex<Real>  Complex;
-
-
- class Zero{};
- static Zero zero;
- template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
- template<> inline void ZeroIt(ComplexF &arg){ arg=0; };
- template<> inline void ZeroIt(ComplexD &arg){ arg=0; };
- template<> inline void ZeroIt(RealF &arg){ arg=0; };
- template<> inline void ZeroIt(RealD &arg){ arg=0; };
-
- // TODO
- //
- // Base class to share common code between vRealF, VComplexF etc...
- //
- // lattice broadcast assignment
- //
- // where() support
- // implement with masks, and/or? Type of the mask & boolean support?
- //
- // Unary functions
- // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg
- // exp, log, sqrt, fabs
- //
- // transposeColor, transposeSpin,
- // adjColor, adjSpin,
- // traceColor, traceSpin.
- // peekColor, peekSpin + pokeColor PokeSpin
- //
- // copyMask.
- //
- // localMaxAbs
- //
- // norm2,
- // sumMulti equivalent.
- // Fourier transform equivalent.
- //
-
- ////////////////////////////////////////////////////////////////////////////////
- //Provide support functions for basic real and complex data types required by dpo
- //Single and double precision versions. Should be able to template this once only.
- ////////////////////////////////////////////////////////////////////////////////
-
- inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
- inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
- inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
- inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
- inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
- // conj already supported for complex
-
- inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
- inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
- inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
- inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
- inline Complex adj(const Complex& r ){ return(conj(r)); }
- //conj already supported for complex
-
- inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
- inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
- inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
- inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
- inline RealD adj(const RealD & r){ return r; } // No-op for real
- inline RealD conj(const RealD & r){ return r; }
-
- inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
- inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
- inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
- inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
- inline RealF adj(const RealF & r){ return r; }
- inline RealF conj(const RealF & r){ return r; }
-
- ////////////////////////////////////////////////////////////////////////
- // Vector types are arch dependent /////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////
-#if defined (SSE2)
- typedef __m128  fvec;
- typedef __m128d dvec;
- typedef __m128  cvec;
- typedef __m128d zvec;
-#endif
-#if defined (AVX1) || defined (AVX2)
- typedef __m256  fvec;
- typedef __m256d dvec;
- typedef __m256  cvec;
- typedef __m256d zvec;
-#endif
-#if defined (AVX512)
- typedef __m512  fvec;
- typedef __m512d dvec;
- typedef __m512  cvec;
- typedef __m512d zvec;
-#endif
-#if defined (QPX)
- typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
- typedef float cvec __attribute__ ((vector_size (16)));
-
- typedef vector4double dvec;
- typedef vector4double zvec;
-#endif
-#if defined (AVX1) || defined (AVX2) || defined (AVX512)
- inline void v_prefetch0(int size, const char *ptr){
-   for(int i=0;i<size;i+=64){
-     _mm_prefetch(ptr+i,_MM_HINT_T0);
-   }
- }
-#endif
-
-///////////////////////////////////////////////////
-// Scalar, Vector, Matrix objects.
-// These can be composed to form tensor products of internal indices.
-///////////////////////////////////////////////////
-template<class vtype> class iScalar
-{
-public:
-  SIMDalign vtype _internal;
-  iScalar(){};
-  iScalar(Zero &z){ *this = zero; };
-  iScalar & operator= (const Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iScalar &that){
-    zeroit(that._internal);
-  }
-  // Unary negation
-  friend inline iScalar operator -(const iScalar &r) {
-    iScalar ret;
-    ret._internal= -r._internal;
-    return ret;
-  }
-  // *=,+=,-= operators
-  inline iScalar &operator *=(const iScalar &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  inline iScalar &operator -=(const iScalar &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  inline iScalar &operator +=(const iScalar &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
+  double usecond(void);
+  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+  void Grid_debug_handler_init(void);
 };
-
-template<class vtype,int N> class iVector
-{
-public:
-  SIMDalign vtype _internal[N];
-  iVector(Zero &z){ *this = zero; };
-  iVector() {};
-  iVector & operator= (Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iVector &that){
-    for(int i=0;i<N;i++){
-      zeroit(that._internal[i]);
-    }
-  }
-  // Unary negation
-  friend inline iVector<vtype,N> operator -(const iVector &r) {
-    iVector ret;
-    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
-    return ret;
-  }
-  // *=,+=,-= operators
-  inline iVector &operator *=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  inline iVector &operator -=(const iVector &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  inline iVector &operator +=(const iVector &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
-};
-
-
-template<class vtype,int N> class iMatrix
-{
-public:
-  SIMDalign vtype _internal[N][N];
-  iMatrix(Zero &z){ *this = zero; };
-  iMatrix() {};
-  iMatrix & operator= (Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iMatrix &that){
-    for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      zeroit(that._internal[i][j]);
-    }}
-  }
-  // Unary negation
-  friend inline iMatrix<vtype,N> operator -(const iMatrix &r) {
-    iMatrix ret;
-    for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      ret._internal[i][j]= -r._internal[i][j];
-    }}
-    return ret;
-  }
-  // *=,+=,-= operators
-  template<class T>
-  inline iMatrix &operator *=(const T &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  template<class T>
-  inline iMatrix &operator -=(const T &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  template<class T>
-  inline iMatrix &operator +=(const T &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
-};
-/*
- inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
- inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
- inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
- inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
-*/
- inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
- inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
- inline RealD localInnerProduct(const RealD & l, const RealD & r) { return conj(l)*r; }
- inline RealF localInnerProduct(const RealF & l, const RealF & r) { return conj(l)*r; }
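
[The three tensor classes removed here (and re-added below in Grid_math_types.h) are designed to nest: any of iScalar/iVector/iMatrix can serve as the vtype of another, with zeroit() and the arithmetic recursing one index level per call. A minimal stand-in sketch of why that works, using hypothetical *Sketch names rather than the patch's own classes:

#include <complex>
#include <cstdio>

typedef std::complex<double> ComplexD;
inline void zeroit(ComplexD &c){ c=0; }                 // bottom of the recursion

template<class vtype> struct iScalarSketch {
  vtype _internal;
  friend void zeroit(iScalarSketch &s){ zeroit(s._internal); }
};
template<class vtype,int N> struct iMatrixSketch {
  vtype _internal[N][N];
  friend void zeroit(iMatrixSketch &m){
    for(int i=0;i<N;i++) for(int j=0;j<N;j++) zeroit(m._internal[i][j]);
  }
};

int main(void){
  // Compose: a "colour matrix" = scalar wrapper around a 3x3 matrix,
  // as the QCD typedefs later in this patch do.
  iScalarSketch<iMatrixSketch<ComplexD,3> > cm;
  zeroit(cm);                                           // recurses through both levels
  printf("%f\n", cm._internal._internal[0][0].real());  // prints 0.000000
  return 0;
}
]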
-
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  ADD  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// ADD is simple for now; cannot mix types and straightforward template
-// Scalar +/- Scalar
-// Vector +/- Vector
-// Matrix +/- Matrix
-template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
-                                                              const iScalar<ltype> * __restrict__ lhs,
-                                                              const iScalar<rtype> * __restrict__ rhs)
-{
-  add(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
-                                                                    const iVector<ltype,N> * __restrict__ lhs,
-                                                                    const iVector<rtype,N> * __restrict__ rhs)
-{
-  for(int c=0;c<N;c++){
-    ret->_internal[c]=lhs->_internal[c]+rhs->_internal[c];
-  }
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iScalar<ltype> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iScalar<rtype> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2)
-      add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-    else
-      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
-  }}
-  return;
-}
-// Need to figure multi-precision.
-template<class Mytype> Mytype timesI(Mytype &r)
-{
-  iScalar<Complex> i;
-  i._internal = Complex(0,1);
-  return r*i;
-}
-
- // + operator for scalar, vector, matrix
-template<class ltype,class rtype>
-//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
-inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
-{
-  typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-
-
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  SUB  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-// SUB is simple for now; cannot mix types and straightforward template
-// Scalar +/- Scalar
-// Vector +/- Vector
-// Matrix +/- Matrix
-// Matrix +/- Scalar
-template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
-                                                              const iScalar<ltype> * __restrict__ lhs,
-                                                              const iScalar<rtype> * __restrict__ rhs)
-{
-  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-
-template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
-                                                                    const iVector<ltype,N> * __restrict__ lhs,
-                                                                    const iVector<rtype,N> * __restrict__ rhs)
-{
-  for(int c=0;c<N;c++){
-    ret->_internal[c]=lhs->_internal[c]-rhs->_internal[c];
-  }
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iScalar<ltype> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2) {
-      sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-    } else {
-      // Fails -- need unary minus. Catalogue other unops?
-      ret->_internal[c1][c2]=zero;
-      ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2];
-
-    }
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iScalar<rtype> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2)
-      sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-    else
-      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
-  }}
-  return;
-}
-
-template<class v> void vprefetch(const iScalar<v> &vv)
-{
-  vprefetch(vv._internal);
-}
-template<class v,int N> void vprefetch(const iVector<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
-    vprefetch(vv._internal[i]);
-  }
-}
-template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
-  for(int j=0;j<N;j++){
-    vprefetch(vv._internal[i][j]);
-  }}
-}
-
- // - operator for scalar, vector, matrix
-template<class ltype,class rtype> inline auto
-operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal-rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
-{
-  typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////  MAC  ///////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
- ///////////////////////////
- // Legal multiplication table
- ///////////////////////////
- // scal x scal = scal
- // mat x  mat  = mat
- // mat  x scal = mat
- // scal x mat  = mat
- // mat  x vec  = vec
- // vec  x scal = vec
- // scal x vec  = vec
- ///////////////////////////
-template<class rtype,class vtype,class mtype>
-inline void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
-{
-  mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-  for(int c3=0;c3<N;c3++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
-  }}}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
-  }
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iVector<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
-  }
-  return;
-}
-
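
[The legal multiplication table above is what the mult/mac overloads below implement; the return kind follows the table, not the operand kinds. A hedged usage sketch, assuming the umbrella header compiles after this reorganisation (the include name is taken from this patch, the demo itself is not):

#include <Grid.h>
using namespace dpo;

void demo(void){
  iScalar<ComplexD>   s;  zeroit(s);
  iMatrix<ComplexD,3> m;  zeroit(m);
  auto ss = s*s;   // scal x scal -> scal
  auto mm = m*m;   // mat  x mat  -> mat
  auto ms = m*s;   // mat  x scal -> mat
  auto sm = s*m;   // scal x mat  -> mat
  // vec x vec is deliberately absent from the table: two vectors are
  // combined with localInnerProduct (contract) or outerProduct (expand).
}
]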
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  MUL  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template<class rtype,class vtype,class mtype>
-inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
-    for(int c3=1;c3<N;c3++){
-      mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
-    }
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-  }}
-  return;
-}
-
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-// Matrix left multiplies vector
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
-    for(int c2=1;c2<N;c2++){
-      mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
-    }
-  }
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,
-                 const iScalar<vtype> * __restrict__ lhs,
-                 const iVector<mtype,N> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
-  }
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,
-                 const iVector<vtype,N> * __restrict__ rhs,
-                 const iScalar<mtype> * __restrict__ lhs){
-  mult(ret,lhs,rhs);
-}
-
-
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iMatrix<vtype,N>& lhs,const iVector<rtype,N>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iScalar<vtype>& lhs,const iVector<rtype,N>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iVector<rtype,N>& lhs,const iScalar<vtype>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
- //////////////////////////////////////////////////////////////////
- // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
- // since nesting matrix<scalar> x matrix<scalar> -> matrix<scalar>
- // while   matrix<vector> x matrix<vector> -> matrix<vector>
- // so return type depends on argument types in nasty way.
- //////////////////////////////////////////////////////////////////
- // scal x scal = scal
- // mat x  mat  = mat
- // mat  x scal = mat
- // scal x mat  = mat
- // mat  x vec  = vec
- // vec  x scal = vec
- // scal x vec  = vec
-
-template<class l,class r>
-inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal*rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
-  ret_t ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
-  iMatrix<ret_t,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iScalar<r>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;
-
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
-  }}
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
-{
-  typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
-    for(int c2=1;c2<N;c2++){
-      mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
-    }
-  }
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
-{
-  typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
-  }
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
-{
-  typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
-  }
-  return ret;
-}
-
- ///////////////////////////////////////////////////////////////////////////////////////
- // localInnerProduct Scalar x Scalar -> Scalar
- // localInnerProduct Vector x Vector -> Scalar
- // localInnerProduct Matrix x Matrix -> Scalar
- ///////////////////////////////////////////////////////////////////////////////////////
- template<class l,class r,int N> inline
- auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
- {
-   typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
-   iScalar<ret_t> ret=zero;
-   for(int c1=0;c1<N;c1++){
-     ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
-   }
-   return ret;
- }
- template<class l,class r,int N> inline
- auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
- {
-   typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
-   iScalar<ret_t> ret=zero;
-   for(int c1=0;c1<N;c1++){
-   for(int c2=0;c2<N;c2++){
-     ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
-   }}
-   return ret;
- }
- template<class l,class r> inline
- auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
- {
-   typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
-   iScalar<ret_t> ret;
-   ret._internal = localInnerProduct(lhs._internal,rhs._internal);
-   return ret;
- }
-
- ///////////////////////////////////////////////////////////////////////////////////////
- // outerProduct Scalar x Scalar -> Scalar
- //              Vector x Vector -> Matrix
- ///////////////////////////////////////////////////////////////////////////////////////
-
-template<class l,class r,int N> inline
-auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
-{
-  typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
-  }}
-  return ret;
-}
-template<class l,class r> inline
-auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
-{
-  typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
-  iScalar<ret_t> ret;
-  ret._internal = outerProduct(lhs._internal,rhs._internal);
-  return ret;
-}
-/*
- inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
- {
-   return l*r;
- }
- inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
- {
-   return l*r;
- }
- inline vRealF outerProduct(const vRealF &l, const vRealF& r)
- {
-   return l*r;
- }
- inline vRealD outerProduct(const vRealD &l, const vRealD& r)
- {
-   return l*r;
- }
-*/
- inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
- {
-   return l*r;
- }
- inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
- {
-   return l*r;
- }
- inline RealF outerProduct(const RealF &l, const RealF& r)
- {
-   return l*r;
- }
- inline RealD outerProduct(const RealD &l, const RealD& r)
- {
-   return l*r;
- }
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conj function for scalar, vector, matrix
-template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
-{
-  iScalar<vtype> ret;
-  ret._internal = conj(r._internal);
-  return ret;
-}
-
-// Adj function for scalar, vector, matrix
-template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
-{
-  iScalar<vtype> ret;
-  ret._internal = adj(r._internal);
-  return ret;
-}
-template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
-{
-  iVector<vtype,N> ret;
-  for(int i=0;i<N;i++){
-    ret._internal[i] = adj(r._internal[i]);
-  }
-  return ret;
-}
-template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
-{
-  iMatrix<vtype,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
-  }}
-  return ret;
-}
-
-template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
-{
-  iScalar<decltype(real(z._internal))> ret;
-  ret._internal = real(z._internal);
-  return ret;
-}
-template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
-{
-  iMatrix<decltype(real(z._internal[0][0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = real(z._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
-{
-  iVector<decltype(real(z._internal[0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-    ret._internal[c1] = real(z._internal[c1]);
-  }
-  return ret;
-}
-
-template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
-{
-  iScalar<decltype(imag(z._internal))> ret;
-  ret._internal = imag(z._internal);
-  return ret;
-}
-template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
-{
-  iMatrix<decltype(imag(z._internal[0][0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = imag(z._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
-{
-  iVector<decltype(imag(z._internal[0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-    ret._internal[c1] = imag(z._internal[c1]);
-  }
-  return ret;
-}
-
-template<class vtype,int N>
-inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
-{
-  iScalar<decltype(trace(arg._internal[0][0]))> ret;
-  ZeroIt(ret._internal);
-  for(int i=0;i<N;i++){
-    ret._internal=ret._internal+trace(arg._internal[i][i]);
-  }
-  return ret;
-}
-template<class vtype>
-inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
-{
-  iScalar<decltype(trace(arg._internal))> ret;
-  ret._internal=trace(arg._internal);
-  return ret;
-}
-
-/////////////////////////////////////////////////////////////////////////
-// Generic routine to promote object -> object
-// Supports the array reordering transformation that gives me SIMD utilisation
-/////////////////////////////////////////////////////////////////////////
-/*
-template<template<class> class object>
-inline object<vComplex> splat(object<Complex >s){
-  object<vComplex> ret;
-  vComplex * v_ptr = (vComplex *)& ret;
-  Complex * s_ptr = (Complex *) &s;
-  for(int i=0;i<sizeof(ret)/sizeof(vComplex);i++){
-    vsplat(v_ptr[i],s_ptr[i]);
-  }
-  return ret;
-}
-*/
-
-class Grid {
-public:
-  // Give Lattice access
-  template<class object> friend class Lattice;
-
-//protected:
-
-  // Lattice wide random support. not yet fully implemented. Need seed strategy
-  // and one generator per site.
-  //std::default_random_engine generator;
-  // static std::mt19937  generator( 9 );
-
-
-  // Grid information.
-  unsigned long _ndimension;
-  std::vector<int> _layout;      // Which dimensions get relayed out over simd lanes.
-  std::vector<int> _dimensions;  // Dimensions of array
-  std::vector<int> _rdimensions; // Reduced dimensions with simd lane images removed
-  std::vector<int> _ostride;     // Outer stride for each dimension
-  std::vector<int> _istride;     // Inner stride i.e. within simd lane
-  int _osites;                   // _isites*_osites = product(dimensions).
-  int _isites;
-
-  // subslice information
-  std::vector<int> _slice_block;
-  std::vector<int> _slice_stride;
-  std::vector<int> _slice_nblock;
-public:
-
-  // These routines are key. Subdivide the linearised cartesian index into
-  //      "inner" index identifying which simd lane of object is associated with coord
-  //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
-  // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
-  // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
-  // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
-  // lanes are operated upon simultaneously.
-
-  inline int oIndexReduced(std::vector<int> &rcoor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d];
-    return idx;
-  }
-  virtual int oIndex(std::vector<int> &coor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-    return idx;
-  }
-  inline int iIndex(std::vector<int> &rcoor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
-    return idx;
-  }
-
-  inline int oSites(void) { return _osites; };
-  inline int iSites(void) { return _isites; };
-  virtual int CheckerBoard(std::vector<int> site)=0;
-  virtual int CheckerBoardDestination(int source_cb,int shift)=0;
-  virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
-};
-
-////////////////////////////////////////////////////////////////////
-// A lattice of something, but assume the something is SIMDized.
-////////////////////////////////////////////////////////////////////
-template<typename _Tp>
-class myallocator {
-public:
-  typedef std::size_t     size_type;
-  typedef std::ptrdiff_t  difference_type;
-  typedef _Tp*       pointer;
-  typedef const _Tp* const_pointer;
-  typedef _Tp&       reference;
-  typedef const _Tp& const_reference;
-  typedef _Tp        value_type;
-
-  template<typename _Tp1> struct rebind { typedef myallocator<_Tp1> other; };
-  myallocator() throw() { }
-  myallocator(const myallocator&) throw() { }
-  template<typename _Tp1> myallocator(const myallocator<_Tp1>&) throw() { }
-  ~myallocator() throw() { }
-  pointer address(reference __x) const { return &__x; }
-  const_pointer address(const_reference __x) const { return &__x; }
-  size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-  // Should override allocate and deallocate
-  pointer allocate(size_type __n, const void* = 0)
-  {
-    //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
-    // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
-#ifdef AVX512
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
-#else
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
-#endif
-
-    return ptr;
-  }
-  void deallocate(pointer __p, size_type) {
-    free(__p);
-  }
-  void construct(pointer __p, const _Tp& __val) { };
-  void construct(pointer __p) { };
-  void destroy(pointer __p) { };
-};
-
-template<typename _Tp> inline bool
-operator==(const myallocator<_Tp>&, const myallocator<_Tp>&){ return true; }
-
-template<typename _Tp> inline bool
-operator!=(const myallocator<_Tp>&, const myallocator<_Tp>&){ return false; }
-
-
-}; // namespace dpo
-
 #endif
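
[The inner/outer index split in class Grid above is easiest to see with numbers. A standalone restatement of oIndex/iIndex on a hypothetical 4x4 grid with two SIMD lanes along dimension 0 (the stride values follow the formulas above, not any code in this patch):

#include <cstdio>
#include <vector>

int main(void){
  std::vector<int> dims   = {4,4};
  std::vector<int> layout = {2,1};   // simd lanes per dimension
  std::vector<int> rdims  = {2,4};   // reduced dims: dims[d]/layout[d]
  std::vector<int> ostride= {1,2};   // outer strides over rdims
  std::vector<int> istride= {1,2};   // inner (lane) strides
  std::vector<int> coor   = {3,1};   // one lattice site
  int o=0,i=0;
  for(int d=0;d<2;d++) o += ostride[d]*(coor[d]%rdims[d]);   // oIndex
  for(int d=0;d<2;d++) i += istride[d]*(coor[d]/rdims[d]);   // iIndex
  printf("site (3,1) -> _odata[%d], simd lane %d\n",o,i);    // _odata[3], lane 1
  return 0;
}
]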

diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index f18784fb..47b85b0f 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -1,5 +1,7 @@
+#ifndef GRID_LATTICE_H
+#define GRID_LATTICE_H
+
 #include "Grid.h"
-#include "Grid_vComplexD.h"
 
 namespace dpo {
 
@@ -9,7 +11,7 @@ class Lattice
 public:
     Grid *_grid;
     int checkerboard;
-    std::vector<vobj,myallocator<vobj> > _odata;
+    std::vector<vobj,alignedAllocator<vobj> > _odata;
 
 public:
 
@@ -554,95 +556,5 @@ public:
         return ret;
     }
 
-
-namespace QCD {
-
-    static const int Nc=3;
-    static const int Ns=4;
-
-    static const int CbRed  =0;
-    static const int CbBlack=1;
-
-    // QCD iMatrix types
-    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
-    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
-    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
-
-    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
-
-    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
-    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
-    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
-
-    typedef iSinglet<Complex >  TComplex;  // This is painful. Tensor singlet complex type.
-    typedef iSinglet<vComplex > vTComplex;
-    typedef iSinglet<Real >     TReal;     // This is painful. Tensor singlet real type.
-
-    typedef iSpinMatrix<Complex >       SpinMatrix;
-    typedef iColourMatrix<Complex >     ColourMatrix;
-    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
-
-    typedef iSpinVector<Complex >       SpinVector;
-    typedef iColourVector<Complex >     ColourVector;
-    typedef iSpinColourVector<Complex > SpinColourVector;
-
-
-    typedef iSpinMatrix<vComplex >       vSpinMatrix;
-    typedef iColourMatrix<vComplex >     vColourMatrix;
-    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
-
-    typedef iSpinVector<vComplex >       vSpinVector;
-    typedef iColourVector<vComplex >     vColourVector;
-    typedef iSpinColourVector<vComplex > vSpinColourVector;
-
-
-    typedef Lattice<vTComplex>         LatticeComplex;
-
-    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
-    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
-    typedef Lattice<vSpinColourMatrix> LatticePropagator;
-    typedef LatticePropagator          LatticeSpinColourMatrix;
-
-    typedef Lattice<vSpinColourVector> LatticeFermion;
-    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
-    typedef Lattice<vSpinVector>       LatticeSpinVector;
-    typedef Lattice<vColourVector>     LatticeColourVector;
-
-    // localNorm2,
-    template<class tt>
-    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=trace(adj(rhs)*rhs);
-        }
-        return ret;
-    }
-    // localInnerProduct
-    template<class ll,class rr>
-    inline LatticeComplex localInnerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-}   //namespace QCD
-
 }
+#endif
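
[The QCD aliases being moved out of Grid_Lattice.h compose the three tensor classes with spin as the outer index and colour as the inner one; even spin-singlet objects keep a trivial iScalar wrapper so spin and colour traces recurse through the same overloads. A hedged access sketch, assuming the umbrella Grid.h compiles after this reorganisation:

#include <Grid.h>
using namespace dpo;
using namespace dpo::QCD;

int main(void){
  SpinColourMatrix scm;   // iMatrix<iMatrix<Complex,Nc>,Ns>: 4x4 spin, 3x3 colour entries
  zeroit(scm);
  // Spin indices select the outer iMatrix, colour indices the inner one:
  scm._internal[0][1]._internal[2][0] = Complex(1.0,0.0);
  ColourMatrix cm;        // iScalar<iMatrix<Complex,Nc>>: trivial spin wrapper
  zeroit(cm);
  cm._internal._internal[0][0] = Complex(1.0,0.0);
  return 0;
}
]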

diff --git a/Grid_QCD.h b/Grid_QCD.h
new file mode 100644
index 00000000..92a02506
--- /dev/null
+++ b/Grid_QCD.h
@@ -0,0 +1,94 @@
+#ifndef GRID_QCD_H
+#define GRID_QCD_H
+namespace dpo{
+namespace QCD {
+
+    static const int Nc=3;
+    static const int Ns=4;
+
+    static const int CbRed  =0;
+    static const int CbBlack=1;
+
+    // QCD iMatrix types
+    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
+    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
+    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
+
+    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
+
+    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
+    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
+    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
+
+    typedef iSinglet<Complex >  TComplex;  // This is painful. Tensor singlet complex type.
+    typedef iSinglet<vComplex > vTComplex;
+    typedef iSinglet<Real >     TReal;     // This is painful. Tensor singlet real type.
+
+    typedef iSpinMatrix<Complex >       SpinMatrix;
+    typedef iColourMatrix<Complex >     ColourMatrix;
+    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
+
+    typedef iSpinVector<Complex >       SpinVector;
+    typedef iColourVector<Complex >     ColourVector;
+    typedef iSpinColourVector<Complex > SpinColourVector;
+
+
+    typedef iSpinMatrix<vComplex >       vSpinMatrix;
+    typedef iColourMatrix<vComplex >     vColourMatrix;
+    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
+
+    typedef iSpinVector<vComplex >       vSpinVector;
+    typedef iColourVector<vComplex >     vColourVector;
+    typedef iSpinColourVector<vComplex > vSpinColourVector;
+
+
+    typedef Lattice<vTComplex>         LatticeComplex;
+
+    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
+    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
+    typedef Lattice<vSpinColourMatrix> LatticePropagator;
+    typedef LatticePropagator          LatticeSpinColourMatrix;
+
+    typedef Lattice<vSpinColourVector> LatticeFermion;
+    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
+    typedef Lattice<vSpinVector>       LatticeSpinVector;
+    typedef Lattice<vColourVector>     LatticeColourVector;
+
+    // localNorm2,
+    template<class tt>
+    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=trace(adj(rhs)*rhs);
+        }
+        return ret;
+    }
+    // localInnerProduct
+    template<class ll,class rr>
+    inline LatticeComplex localInnerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+} //namespace QCD
+} // dpo
+#endif
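
[The three lattice routines in Grid_QCD.h all follow the same site-parallel pattern: one OpenMP loop over outer sites, with the SIMD lanes inside each vobj handled implicitly by the vectorised element type. The pattern extends to any site-local function; a hedged sketch (localAdj is an invented name for illustration, not part of this patch):

#include <Grid.h>
using namespace dpo;

template<class vobj>
inline Lattice<vobj> localAdj(const Lattice<vobj> &rhs)
{
    Lattice<vobj> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites();ss++){
        ret._odata[ss] = adj(rhs._odata[ss]);   // site-local, lane-parallel
    }
    return ret;
}
]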

diff --git a/Grid_aligned_allocator.h b/Grid_aligned_allocator.h
new file mode 100644
index 00000000..6040c233
--- /dev/null
+++ b/Grid_aligned_allocator.h
@@ -0,0 +1,56 @@
+#ifndef GRID_ALIGNED_ALLOCATOR_H
+#define GRID_ALIGNED_ALLOCATOR_H
+namespace dpo {
+
+////////////////////////////////////////////////////////////////////
+// A lattice of something, but assume the something is SIMDized.
+////////////////////////////////////////////////////////////////////
+template<typename _Tp>
+class alignedAllocator {
+public:
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
+  alignedAllocator() throw() { }
+  alignedAllocator(const alignedAllocator&) throw() { }
+  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
+  ~alignedAllocator() throw() { }
+  pointer address(reference __x) const { return &__x; }
+  const_pointer address(const_reference __x) const { return &__x; }
+  size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+  // Should override allocate and deallocate
+  pointer allocate(size_type __n, const void* = 0)
+  {
+    //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
+    // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#ifdef AVX512
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#else
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#endif
+
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) {
+    free(__p);
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+
+template<typename _Tp> inline bool
+operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+
+template<typename _Tp> inline bool
+operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+
+}; // namespace dpo
+#endif
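
[alignedAllocator slots into the standard containers, which is how Lattice declares _odata above. A usage sketch, assuming Grid_aligned_allocator.h is on the include path; note that deallocate() hands the pointer to free() while the non-AVX512 branch of allocate() uses _mm_malloc(), where a matching _mm_free() would normally be expected (kept here as the patch has it):

#include <vector>
#include <cstdint>
#include <cassert>
#include <Grid_aligned_allocator.h>

int main(void){
  // Any element type works; the backing store is requested at 128 bytes.
  std::vector<double, dpo::alignedAllocator<double> > v(1024);
  assert( (uintptr_t)&v[0] % 128 == 0 );   // storage is 128-byte aligned
  return 0;
}
]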

diff --git a/Grid_config.h b/Grid_config.h
new file mode 100644
index 00000000..b882917d
--- /dev/null
+++ b/Grid_config.h
@@ -0,0 +1,101 @@
+/* Grid_config.h.  Generated from Grid_config.h.in by configure.  */
+/* Grid_config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* AVX */
+#define AVX1 1
+
+/* AVX2 */
+/* #undef AVX2 */
+
+/* AVX512 */
+/* #undef AVX512 */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <malloc.h> header file. */
+/* #undef HAVE_MALLOC_H */
+
+/* Define to 1 if you have the <malloc/malloc.h> header file. */
+#define HAVE_MALLOC_MALLOC_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the `aligned' variable attribute */
+#define HAVE_VAR_ATTRIBUTE_ALIGNED 1
+
+/* Name of package */
+#define PACKAGE "grid"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "Grid"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "Grid 1.0"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "grid"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.0"
+
+/* SSE2 */
+/* #undef SSE2 */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Version number of package */
+#define VERSION "1.0"
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT64_T */
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 64 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint64_t */
diff --git a/Grid_math_types.h b/Grid_math_types.h
new file mode 100644
index 00000000..405d5c9a
--- /dev/null
+++ b/Grid_math_types.h
@@ -0,0 +1,819 @@
+#ifndef GRID_MATH_TYPES_H
+#define GRID_MATH_TYPES_H
+namespace dpo {
+///////////////////////////////////////////////////
+// Scalar, Vector, Matrix objects.
+// These can be composed to form tensor products of internal indices.
+///////////////////////////////////////////////////
+
+template<class vtype> class iScalar
+{
+public:
+  SIMDalign vtype _internal;
+  iScalar(){};
+  iScalar(Zero &z){ *this = zero; };
+  iScalar & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iScalar &that){
+    zeroit(that._internal);
+  }
+  // Unary negation
+  friend inline iScalar operator -(const iScalar &r) {
+    iScalar ret;
+    ret._internal= -r._internal;
+    return ret;
+  }
+  // *=,+=,-= operators
+  inline iScalar &operator *=(const iScalar &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  inline iScalar &operator -=(const iScalar &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  inline iScalar &operator +=(const iScalar &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+
+};
+
+template<class vtype,int N> class iVector
+{
+public:
+  SIMDalign vtype _internal[N];
+  iVector(Zero &z){ *this = zero; };
+  iVector() {};
+  iVector & operator= (Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iVector &that){
+    for(int i=0;i<N;i++){
+      zeroit(that._internal[i]);
+    }
+  }
+  // Unary negation
+  friend inline iVector<vtype,N> operator -(const iVector &r) {
+    iVector ret;
+    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+    return ret;
+  }
+  // *=,+=,-= operators
+  inline iVector &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  inline iVector &operator -=(const iVector &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  inline iVector &operator +=(const iVector &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+};
+
+
+template<class vtype,int N> class iMatrix
+{
+public:
+  SIMDalign vtype _internal[N][N];
+  iMatrix(Zero &z){ *this = zero; };
+  iMatrix() {};
+  iMatrix & operator= (Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iMatrix &that){
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      zeroit(that._internal[i][j]);
+    }}
+  }
+  // Unary negation
+  friend inline iMatrix<vtype,N> operator -(const iMatrix &r) {
+    iMatrix ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j]= -r._internal[i][j];
+    }}
+    return ret;
+  }
+  // *=,+=,-= operators
+  template<class T>
+  inline iMatrix &operator *=(const T &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  template<class T>
+  inline iMatrix &operator -=(const T &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  template<class T>
+  inline iMatrix &operator +=(const T &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+};
+
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  ADD  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// ADD is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+  add(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+  for(int c=0;c<N;c++){
+    ret->_internal[c]=lhs->_internal[c]+rhs->_internal[c];
+  }
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iScalar<ltype> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iScalar<rtype> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2)
+      add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    else
+      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+  }}
+  return;
+}
+// Need to figure multi-precision.
+template<class Mytype> Mytype timesI(Mytype &r)
+{
+  iScalar<Complex> i;
+  i._internal = Complex(0,1);
+  return r*i;
+}
+
+ // + operator for scalar, vector, matrix
+template<class ltype,class rtype>
+//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
+inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
+{
+  typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  SUB  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// SUB is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+// Matrix +/- Scalar
+template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+  for(int c=0;c<N;c++){
+    ret->_internal[c]=lhs->_internal[c]-rhs->_internal[c];
+  }
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iScalar<ltype> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2) {
+      sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+    } else {
+      // Fails -- need unary minus. Catalogue other unops?
+      ret->_internal[c1][c2]=zero;
+      ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2];
+
+    }
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iScalar<rtype> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2)
+      sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    else
+      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+  }}
+  return;
+}
+
+template<class v> void vprefetch(const iScalar<v> &vv)
+{
+  vprefetch(vv._internal);
+}
+template<class v,int N> void vprefetch(const iVector<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+    vprefetch(vv._internal[i]);
+  }
+}
+template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+  for(int j=0;j<N;j++){
+    vprefetch(vv._internal[i][j]);
+  }}
+}
+
+ // - operator for scalar, vector, matrix
+template<class ltype,class rtype> inline auto
+operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal-rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
+{
+  typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////  MAC  ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+ ///////////////////////////
+ // Legal multiplication table
+ ///////////////////////////
+ // scal x scal = scal
+ // mat x  mat  = mat
+ // mat  x scal = mat
+ // scal x mat  = mat
+ // mat  x vec  = vec
+ // vec  x scal = vec
+ // scal x vec  = vec
+ ///////////////////////////
+template<class rtype,class vtype,class mtype>
+inline void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+  mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+  for(int c3=0;c3<N;c3++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+  }}}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+  }
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iVector<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
+  }
+  return;
+}
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  MUL  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template<class rtype,class vtype,class mtype>
+inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
+    for(int c3=1;c3<N;c3++){
+      mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+    }
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+  }}
+  return;
+}
+
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+// Matrix left multiplies vector
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
+    for(int c2=1;c2<N;c2++){
+      mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+    }
+  }
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iScalar<vtype> * __restrict__ lhs,
+                 const iVector<mtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+  }
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iVector<vtype,N> * __restrict__ rhs,
+                 const iScalar<mtype> * __restrict__ lhs){
+  mult(ret,lhs,rhs);
+}
+
+
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iMatrix<vtype,N>& lhs,const iVector<rtype,N>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iScalar<vtype>& lhs,const iVector<rtype,N>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iVector<rtype,N>& lhs,const iScalar<vtype>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+ //////////////////////////////////////////////////////////////////
+ // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
+ // since nesting matrix<scalar> x matrix<scalar> -> matrix<scalar>
+ // while   matrix<vector> x matrix<vector> -> matrix<vector>
+ // so return type depends on argument types in nasty way.
+ //////////////////////////////////////////////////////////////////
+ // scal x scal = scal
+ // mat x  mat  = mat
+ // mat  x scal = mat
+ // scal x mat  = mat
+ // mat  x vec  = vec
+ // vec  x scal = vec
+ // scal x vec  = vec
+
+template<class l,class r>
+inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal*rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
+  ret_t ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
+  iMatrix<ret_t,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iScalar<r>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;
+
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
+  }}
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
+{
+  typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
+    for(int c2=1;c2<N;c2++){
+      mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
+    }
+  }
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
+{
+  typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
+  }
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
+{
+  typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
+  }
+  return ret;
+}
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // localInnerProduct Scalar x Scalar -> Scalar
+ // localInnerProduct Vector x Vector -> Scalar
+ // localInnerProduct Matrix x Matrix -> Scalar
+ ///////////////////////////////////////////////////////////////////////////////////////
+ template<class l,class r,int N> inline
+ auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+   iScalar<ret_t> ret=zero;
+   for(int c1=0;c1<N;c1++){
+     ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
+   }
+   return ret;
+ }
+ template<class l,class r,int N> inline
+ auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+   iScalar<ret_t> ret=zero;
+   for(int c1=0;c1<N;c1++){
+   for(int c2=0;c2<N;c2++){
+     ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+   }}
+   return ret;
+ }
+ template<class l,class r> inline
+ auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
+   iScalar<ret_t> ret;
+   ret._internal = localInnerProduct(lhs._internal,rhs._internal);
+   return ret;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // outerProduct Scalar x Scalar -> Scalar
+ //              Vector x Vector -> Matrix
+ ///////////////////////////////////////////////////////////////////////////////////////
+
+template<class l,class r,int N> inline
+auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
+{
+  typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
+  }}
+  return ret;
+}
+template<class l,class r> inline
+auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
+{
+  typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = outerProduct(lhs._internal,rhs._internal);
+  return ret;
+}
+
+inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
+{
+  return l*r;
+}
+inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
+{
+  return l*r;
+}
+inline RealF outerProduct(const RealF &l, const RealF& r)
+{
+  return l*r;
+}
+inline RealD outerProduct(const RealD &l, const RealD& r)
+{
+  return l*r;
+}
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
+{
+  iScalar<vtype> ret;
+  ret._internal = conj(r._internal);
+  return ret;
+}
+
+// Adj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
+{
+  iScalar<vtype> ret;
+  ret._internal = adj(r._internal);
+  return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    ret._internal[i] = adj(r._internal[i]);
+  }
+  return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
+  }}
+  return ret;
+}
+
+template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
+{
+  iScalar<decltype(real(z._internal))> ret;
+  ret._internal = real(z._internal);
+  return ret;
+}
+template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
+{
+  iMatrix<decltype(real(z._internal[0][0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = real(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
+{
+  iVector<decltype(real(z._internal[0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = real(z._internal[c1]);
+  }
+  return ret;
+}
+
+template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
+{
+  iScalar<decltype(imag(z._internal))> ret;
+  ret._internal = imag(z._internal);
+  return ret;
+}
+template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
+{
+  iMatrix<decltype(imag(z._internal[0][0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = imag(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
+{
+  iVector<decltype(imag(z._internal[0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = imag(z._internal[c1]);
+  }
+  return ret;
+}
+
+template<class vtype,int N>
+inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
+{
+  iScalar<decltype(trace(arg._internal[0][0]))> ret;
+  ZeroIt(ret._internal);
+  for(int i=0;i<N;i++){
+    ret._internal=ret._internal+trace(arg._internal[i][i]);
+  }
+  return ret;
+}
+template<class vtype>
+inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
+{
+  iScalar<decltype(trace(arg._internal))> ret;
+  ret._internal=trace(arg._internal);
+  return ret;
+}
+};
+/////////////////////////////////////////////////////////////////////////
+// Generic routine to promote object -> object
+// Supports the array reordering transformation that gives me SIMD utilisation
+/////////////////////////////////////////////////////////////////////////
+/*
+template<template<class> class object>
+inline object<vComplex> splat(object<Complex >s){
+  object<vComplex> ret;
+  vComplex * v_ptr = (vComplex *)& ret;
+  Complex * s_ptr = (Complex *) &s;
+  for(int i=0;i<sizeof(ret)/sizeof(vComplex);i++){
+    vsplat(v_ptr[i],s_ptr[i]);
+  }
+  return ret;
+}
+*/
+#endif
diff --git a/Grid_signal.cc b/Grid_signal.cc
diff --git a/Grid_simd.h b/Grid_simd.h
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -1,9 +1,36 @@
+// TODO
+//
+// Base class to share common code between vRealF, VComplexF etc...
+//
+// lattice broadcast assignment
+//
+// where() support
+// implement with masks, and/or? Type of the mask & boolean support?
+//
+// Unary functions
+// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg
+// exp, log, sqrt, fabs
+//
+// transposeColor, transposeSpin,
+// adjColor, adjSpin,
+// traceColor, traceSpin.
+// peekColor, peekSpin + pokeColor PokeSpin
+//
+// copyMask.
+//
+// localMaxAbs
+//
+// norm2,
+// sumMulti equivalent.
+// Fourier transform equivalent.
+//
+
 namespace dpo {
@@ -21,7 +48,46 @@ namespace dpo {
     typedef std::complex<RealD> ComplexD;
     typedef std::complex<Real>  Complex;
 
-
+
+    inline RealF adj(const RealF & r){ return r; }
+    inline RealF conj(const RealF & r){ return r; }
+    inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
+    inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+    inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
+    inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //Provide support functions for basic real and complex data types required by dpo
+    //Single and double precision versions. Should be able to template this once only.
+    ////////////////////////////////////////////////////////////////////////////////
+    inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
+    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
+    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
+    // conj already supported for complex
+
+    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+    inline Complex adj(const Complex& r ){ return(conj(r)); }
+    //conj already supported for complex
+
+    inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
+    inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
+    inline RealD adj(const RealD & r){ return r; } // No-op for real
+    inline RealD conj(const RealD & r){ return r; }
+
+    inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+    inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+
+
     class Zero{};
     static Zero zero;
     template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
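
[Every tensor operation in this patch ultimately lands on these scalar kernels: mult/mac/sub/add on iScalar/iVector/iMatrix peel one index level per call until they reach the ComplexF/ComplexD/RealF/RealD overloads. A self-contained sketch of the recursion bottoming out, with plain std::complex standing in for the dpo typedefs:

#include <complex>
#include <cstdio>
typedef std::complex<double> ComplexD;

// The leaf kernel, as in the hunk above.
inline void mac(ComplexD *y,const ComplexD *a,const ComplexD *x){ *y=(*a)*(*x)+(*y); }

// One recursive step in the style of mac(iMatrix,iMatrix,iMatrix):
// an N=2 matrix-of-ComplexD multiply-accumulate unrolled to leaf calls.
int main(void){
  ComplexD y[2][2]={}, a[2][2], x[2][2];
  for(int i=0;i<2;i++) for(int j=0;j<2;j++){ a[i][j]=ComplexD(i+1,0); x[i][j]=ComplexD(0,j+1); }
  for(int c1=0;c1<2;c1++)
  for(int c2=0;c2<2;c2++)
  for(int c3=0;c3<2;c3++)
    mac(&y[c1][c2],&a[c1][c3],&x[c3][c2]);   // y += a*x, leaf by leaf
  printf("y[0][0] = (%g,%g)\n", y[0][0].real(), y[0][0].imag());
  return 0;
}
]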