From 70e689900be8084c95647bdb0a4e95f5fee45c77 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 17:58:13 +0000 Subject: [PATCH 001/754] NAMESPACE --- lib/simd/l1p.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/simd/l1p.h b/lib/simd/l1p.h index 8e43fdbd..b246bb90 100644 --- a/lib/simd/l1p.h +++ b/lib/simd/l1p.h @@ -1,26 +1,26 @@ #pragma once -namespace Grid { +NAMESPACE_BEGIN(Grid); // L1p optimisation inline void bgq_l1p_optimisation(int mode) { #ifdef QPX #undef L1P_CFG_PF_USR #define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ - + uint64_t cfg_pf_usr; if ( mode ) { cfg_pf_usr = - L1P_CFG_PF_USR_ifetch_depth(0) + L1P_CFG_PF_USR_ifetch_depth(0) | L1P_CFG_PF_USR_ifetch_max_footprint(1) | L1P_CFG_PF_USR_pf_stream_est_on_dcbt | L1P_CFG_PF_USR_pf_stream_establish_enable | L1P_CFG_PF_USR_pf_stream_optimistic | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; // if ( sizeof(Float) == sizeof(double) ) { - cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; - // } else { - // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; - // } + cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; + // } else { + // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; + // } } else { cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) | L1P_CFG_PF_USR_dfetch_max_footprint(2) @@ -34,4 +34,4 @@ inline void bgq_l1p_optimisation(int mode) *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; #endif } -} +NAMESPACE_END(Grid) From 13bce2a6bf538fdefd172789a0c5243fb385f2a3 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 17:58:53 +0000 Subject: [PATCH 002/754] NAMESPACE --- lib/simd/Simd.h | 324 ++++++++++++++++++++++++------------------------ 1 file changed, 161 insertions(+), 163 deletions(-) diff --git a/lib/simd/Simd.h b/lib/simd/Simd.h index 3f2b10dc..4a5b1569 100644 --- a/lib/simd/Simd.h +++ b/lib/simd/Simd.h @@ -41,7 +41,6 @@ directory // Vector types are arch dependent //////////////////////////////////////////////////////////////////////// - #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D)) #define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))" #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H)) @@ -50,206 +49,205 @@ directory #define RotateBit (0x100) -namespace Grid { +NAMESPACE_BEGIN(Grid); - typedef uint32_t Integer; +typedef uint32_t Integer; - typedef float RealF; - typedef double RealD; +typedef float RealF; +typedef double RealD; #ifdef GRID_DEFAULT_PRECISION_DOUBLE - typedef RealD Real; +typedef RealD Real; #else - typedef RealF Real; +typedef RealF Real; #endif - typedef std::complex ComplexF; - typedef std::complex ComplexD; - typedef std::complex Complex; +typedef std::complex ComplexF; +typedef std::complex ComplexD; +typedef std::complex Complex; - inline RealF adj(const RealF & r){ return r; } - inline RealF conjugate(const RealF & r){ return r; } - inline RealF real(const RealF & r){ return r; } +inline RealF adj(const RealF & r){ return r; } +inline RealF conjugate(const RealF & r){ return r; } +inline RealF real(const RealF & r){ return r; } - inline RealD adj(const RealD & r){ return r; } - inline RealD conjugate(const RealD & r){ return r; } - inline RealD real(const RealD & r){ return r; } +inline RealD adj(const RealD & r){ return r; } +inline RealD conjugate(const RealD 
& r){ return r; } +inline RealD real(const RealD & r){ return r; } - inline RealD sqrt(const RealD & r){ return std::sqrt(r); } +inline RealD sqrt(const RealD & r){ return std::sqrt(r); } - inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); } - inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); } - inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); } - inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); } +inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); } +inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); } +inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); } +inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); } - inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; } - inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; } - inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } - inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } +inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; } +inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; } +inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } +inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } - inline ComplexD Reduce(const ComplexD& r){ return r; } - inline ComplexF Reduce(const ComplexF& r){ return r; } - inline RealD Reduce(const RealD& r){ return r; } - inline RealF Reduce(const RealF& r){ return r; } +inline ComplexD Reduce(const ComplexD& r){ return r; } +inline ComplexF Reduce(const ComplexF& r){ return r; } +inline RealD Reduce(const RealD& r){ return r; } +inline RealF Reduce(const RealF& r){ return r; } - inline RealD toReal(const ComplexD& r){ return real(r); } - inline RealF toReal(const ComplexF& r){ return real(r); } - inline RealD toReal(const RealD& r){ return r; } - inline RealF toReal(const RealF& r){ return r; } +inline RealD toReal(const ComplexD& r){ return real(r); } +inline RealF toReal(const ComplexF& r){ return real(r); } +inline RealD toReal(const RealD& r){ return r; } +inline RealF toReal(const RealF& r){ return r; } - //////////////////////////////////////////////////////////////////////////////// - //Provide support functions for basic real and complex data types required by Grid - //Single and double precision versions. Should be able to template this once only. - //////////////////////////////////////////////////////////////////////////////// - inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); }; - inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} - inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} - inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} - // conjugate already supported for complex +//////////////////////////////////////////////////////////////////////////////// +//Provide support functions for basic real and complex data types required by Grid +//Single and double precision versions. Should be able to template this once only. 
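// (Editorial aside, not part of the original patch: the "template this once
// only" remark could plausibly be realised as a single generic kernel, along
// the lines of the sketch below; the parameter name S is hypothetical and the
// library may have reasons to keep the per-type overloads explicit.
//
//   template <class S>
//   inline void mac(S * __restrict__ y, const S * __restrict__ a, const S * __restrict__ x)
//   { *y = (*a) * (*x) + (*y); }
//
// One instantiation would then cover RealF, RealD, ComplexF and ComplexD alike.)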
+//////////////////////////////////////////////////////////////////////////////// +inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); }; +inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} +inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} +inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} +// conjugate already supported for complex - inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } - inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } +inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } +inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } +inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } +inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - //conjugate already supported for complex +//conjugate already supported for complex - inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} - inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} - inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} - inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} +inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} +inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} +inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} +inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} - // define projections to real and imaginay parts - inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));} - inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));} - inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));} - inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));} +// define projections to real and imaginay parts +inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));} +inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));} +inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));} +inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));} - // define auxiliary functions for complex computations - inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);} - inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);} - inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);} - inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);} 
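// (Editorial aside, not part of the original patch: as a concrete check of the
// value-returning helpers above, take z = 3+4i. Then timesI(z) = i*(3+4i) =
// -4+3i and timesMinusI(z) = -i*(3+4i) = 4-3i. The by-reference overloads
// that follow simply store the same results into ret.)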
+// define auxiliary functions for complex computations +inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);} +inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);} +inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);} +inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);} - inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} - inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} - inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);} - inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);} +inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} +inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} +inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);} +inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);} - inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); } - inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } +inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); } +inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } +inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } +inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } - inline void vstream(ComplexF &l, const ComplexF &r){ l=r;} - inline void vstream(ComplexD &l, const ComplexD &r){ l=r;} - inline void vstream(RealF &l, const RealF &r){ l=r;} - inline void vstream(RealD &l, const RealD &r){ l=r;} +inline void vstream(ComplexF &l, const ComplexF &r){ l=r;} +inline void vstream(ComplexD &l, const ComplexD &r){ l=r;} +inline void vstream(RealF &l, const RealF &r){ l=r;} +inline void vstream(RealD &l, const RealD &r){ l=r;} - class Zero{}; - static Zero zero; - template inline void zeroit(itype &arg){ arg=zero;}; - template<> inline void zeroit(ComplexF &arg){ arg=0; }; - template<> inline void zeroit(ComplexD &arg){ arg=0; }; - template<> inline void zeroit(RealF &arg){ arg=0; }; - template<> inline void zeroit(RealD &arg){ arg=0; }; +class Zero{}; +static Zero zero; +template inline void zeroit(itype &arg){ arg=zero;}; +template<> inline void zeroit(ComplexF &arg){ arg=0; }; +template<> inline void zeroit(ComplexD &arg){ arg=0; }; +template<> inline void zeroit(RealF &arg){ arg=0; }; +template<> inline void zeroit(RealD &arg){ arg=0; }; - ////////////////////////////////////////////////////////// - // Permute - // Permute 0 every ABCDEFGH -> BA DC FE HG - // Permute 1 every ABCDEFGH -> CD AB GH 
EF - // Permute 2 every ABCDEFGH -> EFGH ABCD - // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) - // Permute 4 possible on half precision @512bit vectors. - // - // Defined inside SIMD specialization files - ////////////////////////////////////////////////////////// - template - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm); +////////////////////////////////////////////////////////// +// Permute +// Permute 0 every ABCDEFGH -> BA DC FE HG +// Permute 1 every ABCDEFGH -> CD AB GH EF +// Permute 2 every ABCDEFGH -> EFGH ABCD +// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) +// Permute 4 possible on half precision @512bit vectors. +// +// Defined inside SIMD specialization files +////////////////////////////////////////////////////////// +template +inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm); -}; +NAMESPACE_END(Grid); #include #include -namespace Grid { - // Default precision +NAMESPACE_BEGIN(Grid); +// Default precision #ifdef GRID_DEFAULT_PRECISION_DOUBLE - typedef vRealD vReal; - typedef vComplexD vComplex; +typedef vRealD vReal; +typedef vComplexD vComplex; #else - typedef vRealF vReal; - typedef vComplexF vComplex; +typedef vRealF vReal; +typedef vComplexF vComplex; #endif - inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){ - int nn=vComplexF::Nsimd(); - std::vector > buf(nn); - vstore(o,&buf[0]); - stream<<"<"; - for(int i=0;i"; - return stream; +inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){ + int nn=vComplexF::Nsimd(); + std::vector > buf(nn); + vstore(o,&buf[0]); + stream<<"<"; + for(int i=0;i > buf(nn); - vstore(o,&buf[0]); - stream<<"<"; - for(int i=0;i"; - return stream; - } - - inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){ - int nn=vRealF::Nsimd(); - std::vector > buf(nn); - vstore(o,&buf[0]); - stream<<"<"; - for(int i=0;i"; - return stream; - } - - inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){ - int nn=vRealD::Nsimd(); - std::vector > buf(nn); - vstore(o,&buf[0]); - stream<<"<"; - for(int i=0;i"; - return stream; - } - inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){ - int nn=vInteger::Nsimd(); - std::vector > buf(nn); - vstore(o,&buf[0]); - stream<<"<"; - for(int i=0;i"; - return stream; - } - - + stream<<">"; + return stream; } + +inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){ + int nn=vComplexD::Nsimd(); + std::vector > buf(nn); + vstore(o,&buf[0]); + stream<<"<"; + for(int i=0;i"; + return stream; +} + +inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){ + int nn=vRealF::Nsimd(); + std::vector > buf(nn); + vstore(o,&buf[0]); + stream<<"<"; + for(int i=0;i"; + return stream; +} + +inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){ + int nn=vRealD::Nsimd(); + std::vector > buf(nn); + vstore(o,&buf[0]); + stream<<"<"; + for(int i=0;i"; + return stream; +} +inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){ + int nn=vInteger::Nsimd(); + std::vector > buf(nn); + vstore(o,&buf[0]); + stream<<"<"; + for(int i=0;i"; + return stream; +} + +NAMESPACE_END(Grid) #endif From 08682c546101bf085d06b32e94f3d74c43f8c3cc Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 18:03:57 +0000 Subject: [PATCH 003/754] NAMESPACE and format to my liking --- lib/simd/Grid_avx.h | 1206 +++++++++++++++++++++---------------------- 1 file changed, 603 insertions(+), 
603 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index f4634432..1994e224 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -27,8 +27,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #ifdef AVXFMA4 #include @@ -38,102 +38,102 @@ Author: paboyle #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1) #endif -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); - template - union uconv { - __m256 f; - vtype v; - }; +template +union uconv { + __m256 f; + vtype v; +}; - union u256f { - __m256 v; - float f[8]; - }; +union u256f { + __m256 v; + float f[8]; +}; - union u256d { - __m256d v; - double f[4]; - }; +union u256d { + __m256d v; + double f[4]; +}; - struct Vsplat{ - // Complex float - inline __m256 operator()(float a, float b) { - return _mm256_set_ps(b,a,b,a,b,a,b,a); - } - // Real float - inline __m256 operator()(float a){ - return _mm256_set_ps(a,a,a,a,a,a,a,a); - } - //Complex double - inline __m256d operator()(double a, double b){ - return _mm256_set_pd(b,a,b,a); - } - //Real double - inline __m256d operator()(double a){ - return _mm256_set_pd(a,a,a,a); - } - //Integer - inline __m256i operator()(Integer a){ - return _mm256_set1_epi32(a); - } - }; +struct Vsplat{ + // Complex float + inline __m256 operator()(float a, float b) { + return _mm256_set_ps(b,a,b,a,b,a,b,a); + } + // Real float + inline __m256 operator()(float a){ + return _mm256_set_ps(a,a,a,a,a,a,a,a); + } + //Complex double + inline __m256d operator()(double a, double b){ + return _mm256_set_pd(b,a,b,a); + } + //Real double + inline __m256d operator()(double a){ + return _mm256_set_pd(a,a,a,a); + } + //Integer + inline __m256i operator()(Integer a){ + return _mm256_set1_epi32(a); + } +}; - struct Vstore{ - //Float - inline void operator()(__m256 a, float* F){ - _mm256_store_ps(F,a); - } - //Double - inline void operator()(__m256d a, double* D){ - _mm256_store_pd(D,a); - } - //Integer - inline void operator()(__m256i a, Integer* I){ - _mm256_store_si256((__m256i*)I,a); - } +struct Vstore{ + //Float + inline void operator()(__m256 a, float* F){ + _mm256_store_ps(F,a); + } + //Double + inline void operator()(__m256d a, double* D){ + _mm256_store_pd(D,a); + } + //Integer + inline void operator()(__m256i a, Integer* I){ + _mm256_store_si256((__m256i*)I,a); + } - }; +}; - struct Vstream{ - //Float - inline void operator()(float * a, __m256 b){ - _mm256_stream_ps(a,b); - } - //Double - inline void operator()(double * a, __m256d b){ - _mm256_stream_pd(a,b); - } +struct Vstream{ + //Float + inline void operator()(float * a, __m256 b){ + _mm256_stream_ps(a,b); + } + //Double + inline void operator()(double * a, __m256d b){ + _mm256_stream_pd(a,b); + } - }; +}; - struct Vset{ - // Complex float - inline __m256 operator()(Grid::ComplexF *a){ - return 
_mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Complex double - inline __m256d operator()(Grid::ComplexD *a){ - return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Real float - inline __m256 operator()(float *a){ - return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - // Real double - inline __m256d operator()(double *a){ - return _mm256_set_pd(a[3],a[2],a[1],a[0]); - } - // Integer - inline __m256i operator()(Integer *a){ - return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } +struct Vset{ + // Complex float + inline __m256 operator()(Grid::ComplexF *a){ + return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m256d operator()(Grid::ComplexD *a){ + return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m256 operator()(float *a){ + return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m256d operator()(double *a){ + return _mm256_set_pd(a[3],a[2],a[1],a[0]); + } + // Integer + inline __m256i operator()(Integer *a){ + return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } - }; +}; - template +template struct Reduce{ // Need templated class to overload output type // General form must generate error if compiled @@ -144,421 +144,421 @@ namespace Optimization { } }; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline __m256 operator()(__m256 a, __m256 b){ - return _mm256_add_ps(a,b); - } - //Complex/Real double - inline __m256d operator()(__m256d a, __m256d b){ - return _mm256_add_pd(a,b); - } - //Integer - inline __m256i operator()(__m256i a, __m256i b){ +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// +struct Sum{ + //Complex/Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_add_ps(a,b); + } + //Complex/Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_add_pd(a,b); + } + //Integer + inline __m256i operator()(__m256i a, __m256i b){ #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a,0); - b0 = _mm256_extractf128_si256(b,0); - a1 = _mm256_extractf128_si256(a,1); - b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_add_epi32(a0,b0); - a1 = _mm_add_epi32(a1,b1); - return _mm256_set_m128i(a1,a0); + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_add_epi32(a0,b0); + a1 = _mm_add_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) - return _mm256_add_epi32(a,b); + return _mm256_add_epi32(a,b); #endif - } - }; + } +}; - struct Sub{ - //Complex/Real float - inline __m256 operator()(__m256 a, __m256 b){ - return _mm256_sub_ps(a,b); - } - //Complex/Real double - inline __m256d operator()(__m256d a, __m256d b){ - return _mm256_sub_pd(a,b); - } - //Integer - inline __m256i operator()(__m256i a, __m256i b){ +struct Sub{ + //Complex/Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_sub_ps(a,b); + } + //Complex/Real double + inline __m256d 
operator()(__m256d a, __m256d b){ + return _mm256_sub_pd(a,b); + } + //Integer + inline __m256i operator()(__m256i a, __m256i b){ #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a,0); - b0 = _mm256_extractf128_si256(b,0); - a1 = _mm256_extractf128_si256(a,1); - b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_sub_epi32(a0,b0); - a1 = _mm_sub_epi32(a1,b1); - return _mm256_set_m128i(a1,a0); + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_sub_epi32(a0,b0); + a1 = _mm_sub_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) - return _mm256_sub_epi32(a,b); + return _mm256_sub_epi32(a,b); #endif - } - }; + } +}; - struct MultRealPart{ - inline __m256 operator()(__m256 a, __m256 b){ - __m256 ymm0; - ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br - } - inline __m256d operator()(__m256d a, __m256d b){ - __m256d ymm0; - ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 - return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br - } - }; - struct MaddRealPart{ - inline __m256 operator()(__m256 a, __m256 b, __m256 c){ - __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, - return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); - } - inline __m256d operator()(__m256d a, __m256d b, __m256d c){ - __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); - return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c); - } - }; +struct MultRealPart{ + inline __m256 operator()(__m256 a, __m256 b){ + __m256 ymm0; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m256d operator()(__m256d a, __m256d b){ + __m256d ymm0; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } +}; +struct MaddRealPart{ + inline __m256 operator()(__m256 a, __m256 b, __m256 c){ + __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, + return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); + } + inline __m256d operator()(__m256d a, __m256d b, __m256d c){ + __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); + return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c); + } +}; - struct MultComplex{ - // Complex float - inline __m256 operator()(__m256 a, __m256 b){ +struct MultComplex{ + // Complex float + inline __m256 operator()(__m256 a, __m256 b){ #if defined (AVX1) - __m256 ymm0,ymm1,ymm2; - ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br - // FIXME AVX2 could MAC - ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi - ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai - ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi - return _mm256_addsub_ps(ymm0,ymm1); + __m256 ymm0,ymm1,ymm2; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + // FIXME AVX2 could MAC + ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi + ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai + ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm256_addsub_ps(ymm0,ymm1); #endif #if defined (AVXFMA4) 
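// (Editorial aside, not part of the original patch: every branch of this
// operator computes (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br),
// e.g. (1+2i)*(3+4i) = (3-8) + i*(4+6) = -5+10i. The addsub/maddsub
// instructions supply the alternating sign on the ai terms, subtracting in the
// even (real) slots and adding in the odd (imaginary) slots.)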
- __m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar, - __m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai - __m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1)); - a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br - return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr + __m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar, + __m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai + __m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1)); + a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif #if defined (AVX2) || defined (AVXFMA) - __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar - __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai - a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br - return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr + __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar + __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai + a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif - } - // Complex double - inline __m256d operator()(__m256d a, __m256d b) { - // Multiplication of (ak+ibk)*(ck+idk) - // a + i b can be stored as a data structure - // From intel optimisation reference guide - /* - movsldup xmm0, Src1; load real parts into the destination, - ; a1, a1, a0, a0 - movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0 - mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0 - shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0 - movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0 - mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0 - addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d - VSHUFPD (VEX.256 encoded version) - IF IMM0[0] = 0 - THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI; - IF IMM0[1] = 0 - THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI; - IF IMM0[2] = 0 - THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI; - IF IMM0[3] = 0 - THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged - */ + } + // Complex double + inline __m256d operator()(__m256d a, __m256d b) { + // Multiplication of (ak+ibk)*(ck+idk) + // a + i b can be stored as a data structure + // From intel optimisation reference guide + /* + movsldup xmm0, Src1; load real parts into the destination, + ; a1, a1, a0, a0 + movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. 
d1, c1, d0, c0 + mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0 + shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0 + movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0 + mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0 + addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d + VSHUFPD (VEX.256 encoded version) + IF IMM0[0] = 0 + THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI; + IF IMM0[1] = 0 + THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI; + IF IMM0[2] = 0 + THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI; + IF IMM0[3] = 0 + THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged + */ #if defined (AVX1) - __m256d ymm0,ymm1,ymm2; - ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 - ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br - ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01 - ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11 - ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi - return _mm256_addsub_pd(ymm0,ymm1); + __m256d ymm0,ymm1,ymm2; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01 + ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11 + ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm256_addsub_pd(ymm0,ymm1); #endif #if defined (AVXFMA4) - __m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar - __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai - a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br - return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr + __m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar + __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai + a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif #if defined (AVX2) || defined (AVXFMA) - __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar - __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai - a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br - return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr + __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar + __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai + a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif - } + } - }; +}; #if 0 - struct ComplexDot { - - inline void Prep(__m256 ari,__m256 &air) { - cdotRIperm(ari,air); - } - inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) { - riir=air*b; - iirr=arr*b; - }; - inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) { - mac(riir,air,b); - mac(iirr,ari,b); - } - inline void End(__m256 ari,__m256 &air) { - // cdotRI - } +struct ComplexDot { + inline void Prep(__m256 ari,__m256 &air) { + cdotRIperm(ari,air); + } + inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) { + riir=air*b; + iirr=arr*b; }; + inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) { + 
mac(riir,air,b); + mac(iirr,ari,b); + } + inline void End(__m256 ari,__m256 &air) { + // cdotRI + } + +}; #endif - struct Mult{ +struct Mult{ - inline void mac(__m256 &a, __m256 b, __m256 c){ + inline void mac(__m256 &a, __m256 b, __m256 c){ #if defined (AVX1) - a= _mm256_add_ps(_mm256_mul_ps(b,c),a); + a= _mm256_add_ps(_mm256_mul_ps(b,c),a); #endif #if defined (AVXFMA4) - a= _mm256_macc_ps(b,c,a); + a= _mm256_macc_ps(b,c,a); #endif #if defined (AVX2) || defined (AVXFMA) - a= _mm256_fmadd_ps( b, c, a); + a= _mm256_fmadd_ps( b, c, a); #endif - } + } - inline void mac(__m256d &a, __m256d b, __m256d c){ + inline void mac(__m256d &a, __m256d b, __m256d c){ #if defined (AVX1) - a= _mm256_add_pd(_mm256_mul_pd(b,c),a); + a= _mm256_add_pd(_mm256_mul_pd(b,c),a); #endif #if defined (AVXFMA4) - a= _mm256_macc_pd(b,c,a); + a= _mm256_macc_pd(b,c,a); #endif #if defined (AVX2) || defined (AVXFMA) - a= _mm256_fmadd_pd( b, c, a); + a= _mm256_fmadd_pd( b, c, a); #endif - } + } - // Real float - inline __m256 operator()(__m256 a, __m256 b){ - return _mm256_mul_ps(a,b); - } - // Real double - inline __m256d operator()(__m256d a, __m256d b){ - return _mm256_mul_pd(a,b); - } - // Integer - inline __m256i operator()(__m256i a, __m256i b){ + // Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_mul_ps(a,b); + } + // Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_mul_pd(a,b); + } + // Integer + inline __m256i operator()(__m256i a, __m256i b){ #if defined (AVX1) || defined (AVXFMA) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a,0); - b0 = _mm256_extractf128_si256(b,0); - a1 = _mm256_extractf128_si256(a,1); - b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_mullo_epi32(a0,b0); - a1 = _mm_mullo_epi32(a1,b1); - return _mm256_set_m128i(a1,a0); + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_mullo_epi32(a0,b0); + a1 = _mm_mullo_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) - return _mm256_mullo_epi32(a,b); + return _mm256_mullo_epi32(a,b); #endif - } + } +}; + +struct Div { + // Real float + inline __m256 operator()(__m256 a, __m256 b) { + return _mm256_div_ps(a, b); + } + // Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_div_pd(a,b); + } +}; + + +struct Conj{ + // Complex single + inline __m256 operator()(__m256 in){ + return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f)); + } + // Complex double + inline __m256d operator()(__m256d in){ + return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f)); + } + // do not define for integer input +}; + +struct TimesMinusI{ + //Complex single + inline __m256 operator()(__m256 in, __m256 ret){ + __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i + return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r + } + //Complex double + inline __m256d operator()(__m256d in, __m256d ret){ + __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i + return _mm256_shuffle_pd(tmp,tmp,0x5); + } +}; + +struct TimesI{ + //Complex single + inline __m256 operator()(__m256 in, __m256 ret){ + __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r + return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r + } + //Complex double + inline __m256d operator()(__m256d in, __m256d ret){ + __m256d tmp = 
_mm256_shuffle_pd(in,in,0x5); + return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r + } +}; + +////////////////////////////////////////////// +// Some Template specialization +////////////////////////////////////////////// + +struct Permute{ + + static inline __m256 Permute0(__m256 in){ + return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD + }; + static inline __m256 Permute1(__m256 in){ + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF + }; + static inline __m256 Permute2(__m256 in){ + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG + }; + static inline __m256 Permute3(__m256 in){ + return in; }; - struct Div { - // Real float - inline __m256 operator()(__m256 a, __m256 b) { - return _mm256_div_ps(a, b); - } - // Real double - inline __m256d operator()(__m256d a, __m256d b){ - return _mm256_div_pd(a,b); - } + static inline __m256d Permute0(__m256d in){ + return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB }; - - - struct Conj{ - // Complex single - inline __m256 operator()(__m256 in){ - return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f)); - } - // Complex double - inline __m256d operator()(__m256d in){ - return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f)); - } - // do not define for integer input + static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC + return _mm256_shuffle_pd(in,in,0x5); }; - - struct TimesMinusI{ - //Complex single - inline __m256 operator()(__m256 in, __m256 ret){ - __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i - return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r - } - //Complex double - inline __m256d operator()(__m256d in, __m256d ret){ - __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i - return _mm256_shuffle_pd(tmp,tmp,0x5); - } + static inline __m256d Permute2(__m256d in){ + return in; }; - - struct TimesI{ - //Complex single - inline __m256 operator()(__m256 in, __m256 ret){ - __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r - return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r - } - //Complex double - inline __m256d operator()(__m256d in, __m256d ret){ - __m256d tmp = _mm256_shuffle_pd(in,in,0x5); - return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r - } - }; - - ////////////////////////////////////////////// - // Some Template specialization - ////////////////////////////////////////////// - - struct Permute{ - - static inline __m256 Permute0(__m256 in){ - return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD - }; - static inline __m256 Permute1(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF - }; - static inline __m256 Permute2(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG - }; - static inline __m256 Permute3(__m256 in){ - return in; - }; - - static inline __m256d Permute0(__m256d in){ - return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB - }; - static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC - return _mm256_shuffle_pd(in,in,0x5); - }; - static inline __m256d Permute2(__m256d in){ - return in; - }; - static inline __m256d Permute3(__m256d in){ - return in; - }; + static inline __m256d Permute3(__m256d in){ + return in; }; +}; #define USE_FP16 - struct PrecisionChange { - static inline __m256i StoH (__m256 a,__m256 b) { - __m256i h; +struct 
PrecisionChange { + static inline __m256i StoH (__m256 a,__m256 b) { + __m256i h; #ifdef USE_FP16 - __m128i ha = _mm256_cvtps_ph(a,0); - __m128i hb = _mm256_cvtps_ph(b,0); - h =(__m256i) _mm256_castps128_ps256((__m128)ha); - h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); + __m128i ha = _mm256_cvtps_ph(a,0); + __m128i hb = _mm256_cvtps_ph(b,0); + h =(__m256i) _mm256_castps128_ps256((__m128)ha); + h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); #else - assert(0); + assert(0); #endif - return h; - } - static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { + return h; + } + static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { #ifdef USE_FP16 - sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); - sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); + sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); + sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); #else - assert(0); + assert(0); #endif - } - static inline __m256 DtoS (__m256d a,__m256d b) { - __m128 sa = _mm256_cvtpd_ps(a); - __m128 sb = _mm256_cvtpd_ps(b); - __m256 s = _mm256_castps128_ps256(sa); - s = _mm256_insertf128_ps(s,sb,1); - return s; - } - static inline void StoD (__m256 s,__m256d &a,__m256d &b) { - a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); - b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); - } - static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { - __m256 sa,sb; - sa = DtoS(a,b); - sb = DtoS(c,d); - return StoH(sa,sb); - } - static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { - __m256 sa,sb; - HtoS(h,sa,sb); - StoD(sa,a,b); - StoD(sb,c,d); - } + } + static inline __m256 DtoS (__m256d a,__m256d b) { + __m128 sa = _mm256_cvtpd_ps(a); + __m128 sb = _mm256_cvtpd_ps(b); + __m256 s = _mm256_castps128_ps256(sa); + s = _mm256_insertf128_ps(s,sb,1); + return s; + } + static inline void StoD (__m256 s,__m256d &a,__m256d &b) { + a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); + b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); + } + static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { + __m256 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { + __m256 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } +}; +struct Exchange{ + // 3210 ordering + static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + //Invertible + //AB CD -> AC BD + //AC BD -> AB CD + out1= _mm256_permute2f128_ps(in1,in2,0x20); + out2= _mm256_permute2f128_ps(in1,in2,0x31); + }; + static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + //Invertible + // ABCD EFGH ->ABEF CDGH + // ABEF CDGH ->ABCD EFGH + out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + // Invertible ? 
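// (Editorial aside, not part of the original patch: traced lane by lane with
// in1 = ABCDEFGH and in2 = IJKLMNOP, the shuffle pairs below leave
// out1 = AICK EMGO and out2 = BJDL FNHP, i.e. the even and odd elements of
// the two inputs interleaved pairwise. In the single-vector notation of the
// comments, the /*AECG*/ annotation on the final out2 shuffle looks like a
// copy-paste slip for /*BFDH*/.)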
+ // ABCD EFGH -> ACEG BDFH + // ACEG BDFH -> AEBF CGDH + // out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + // out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + // Bollocks; need + // AECG BFDH -> ABCD EFGH + out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + }; + static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + assert(0); + return; }; - struct Exchange{ - // 3210 ordering - static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - //Invertible - //AB CD -> AC BD - //AC BD -> AB CD - out1= _mm256_permute2f128_ps(in1,in2,0x20); - out2= _mm256_permute2f128_ps(in1,in2,0x31); - }; - static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - //Invertible - // ABCD EFGH ->ABEF CDGH - // ABEF CDGH ->ABCD EFGH - out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); - }; - static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - // Invertible ? - // ABCD EFGH -> ACEG BDFH - // ACEG BDFH -> AEBF CGDH - // out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - // out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); - // Bollocks; need - // AECG BFDH -> ABCD EFGH - out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ - out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ - out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - }; - static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - assert(0); - return; - }; - static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ - out1= _mm256_permute2f128_pd(in1,in2,0x20); - out2= _mm256_permute2f128_pd(in1,in2,0x31); - return; - }; - static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ - out1= _mm256_shuffle_pd(in1,in2,0x0); - out2= _mm256_shuffle_pd(in1,in2,0xF); - }; - static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ - assert(0); - return; - }; - static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ - assert(0); - return; - }; + static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ + out1= _mm256_permute2f128_pd(in1,in2,0x20); + out2= _mm256_permute2f128_pd(in1,in2,0x31); + return; }; + static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ + out1= _mm256_shuffle_pd(in1,in2,0x0); + out2= _mm256_shuffle_pd(in1,in2,0xF); + }; + static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ + assert(0); + return; + }; + static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){ + assert(0); + return; + }; +}; #if defined (AVX2) @@ -567,203 +567,203 @@ namespace Optimization { #endif #if defined (AVX1) || defined (AVXFMA) -#define _mm256_alignr_epi32_grid(ret,a,b,n) { \ - __m128 aa, bb; \ - \ - aa = _mm256_extractf128_ps(a,1); \ - bb = _mm256_extractf128_ps(b,1); \ +#define _mm256_alignr_epi32_grid(ret,a,b,n) { \ + __m128 aa, bb; \ + \ + aa = _mm256_extractf128_ps(a,1); \ + bb = _mm256_extractf128_ps(b,1); 
\ aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \ - ret = _mm256_insertf128_ps(ret,aa,1); \ - \ - aa = _mm256_extractf128_ps(a,0); \ - bb = _mm256_extractf128_ps(b,0); \ + ret = _mm256_insertf128_ps(ret,aa,1); \ + \ + aa = _mm256_extractf128_ps(a,0); \ + bb = _mm256_extractf128_ps(b,0); \ aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \ - ret = _mm256_insertf128_ps(ret,aa,0); \ + ret = _mm256_insertf128_ps(ret,aa,0); \ } -#define _mm256_alignr_epi64_grid(ret,a,b,n) { \ - __m128d aa, bb; \ - \ - aa = _mm256_extractf128_pd(a,1); \ - bb = _mm256_extractf128_pd(b,1); \ +#define _mm256_alignr_epi64_grid(ret,a,b,n) { \ + __m128d aa, bb; \ + \ + aa = _mm256_extractf128_pd(a,1); \ + bb = _mm256_extractf128_pd(b,1); \ aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \ - ret = _mm256_insertf128_pd(ret,aa,1); \ - \ - aa = _mm256_extractf128_pd(a,0); \ - bb = _mm256_extractf128_pd(b,0); \ + ret = _mm256_insertf128_pd(ret,aa,1); \ + \ + aa = _mm256_extractf128_pd(a,0); \ + bb = _mm256_extractf128_pd(b,0); \ aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \ - ret = _mm256_insertf128_pd(ret,aa,0); \ + ret = _mm256_insertf128_pd(ret,aa,0); \ } #endif - struct Rotate{ +struct Rotate{ - static inline __m256 rotate(__m256 in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - case 4: return tRotate<4>(in);break; - case 5: return tRotate<5>(in);break; - case 6: return tRotate<6>(in);break; - case 7: return tRotate<7>(in);break; - default: assert(0); - } + static inline __m256 rotate(__m256 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); } - static inline __m256d rotate(__m256d in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - default: assert(0); - } + } + static inline __m256d rotate(__m256d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); } + } - template - static inline __m256 tRotate(__m256 in){ - __m256 tmp = Permute::Permute0(in); - __m256 ret; - if ( n > 3 ) { - _mm256_alignr_epi32_grid(ret,in,tmp,n); - } else { - _mm256_alignr_epi32_grid(ret,tmp,in,n); - } - return ret; + template + static inline __m256 tRotate(__m256 in){ + __m256 tmp = Permute::Permute0(in); + __m256 ret; + if ( n > 3 ) { + _mm256_alignr_epi32_grid(ret,in,tmp,n); + } else { + _mm256_alignr_epi32_grid(ret,tmp,in,n); } + return ret; + } - template - static inline __m256d tRotate(__m256d in){ - __m256d tmp = Permute::Permute0(in); - __m256d ret; - if ( n > 1 ) { - _mm256_alignr_epi64_grid(ret,in,tmp,n); - } else { - _mm256_alignr_epi64_grid(ret,tmp,in,n); - } - return ret; - }; - + template + static inline __m256d tRotate(__m256d in){ + __m256d tmp = Permute::Permute0(in); + __m256d ret; + if ( n > 1 ) { + _mm256_alignr_epi64_grid(ret,in,tmp,n); + } else { + _mm256_alignr_epi64_grid(ret,tmp,in,n); + } + return ret; }; - //Complex float Reduce - template<> - 
inline Grid::ComplexF Reduce::operator()(__m256 in){ - __m256 v1,v2; - v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single - v1= _mm256_add_ps(v1,in); - v2=Optimization::Permute::Permute1(v1); - v1 = _mm256_add_ps(v1,v2); - u256f conv; conv.v = v1; - return Grid::ComplexF(conv.f[0],conv.f[1]); - } - - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(__m256 in){ - __m256 v1,v2; - v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double - v1 = _mm256_add_ps(v1,in); - v2 = Optimization::Permute::Permute1(v1); - v1 = _mm256_add_ps(v1,v2); - v2 = Optimization::Permute::Permute2(v1); - v1 = _mm256_add_ps(v1,v2); - u256f conv; conv.v=v1; - return conv.f[0]; - } - - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(__m256d in){ - __m256d v1; - v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single - v1 = _mm256_add_pd(v1,in); - u256d conv; conv.v = v1; - return Grid::ComplexD(conv.f[0],conv.f[1]); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(__m256d in){ - __m256d v1,v2; - v1 = Optimization::Permute::Permute0(in); // avx 256; quad double - v1 = _mm256_add_pd(v1,in); - v2 = Optimization::Permute::Permute1(v1); - v1 = _mm256_add_pd(v1,v2); - u256d conv; conv.v = v1; - return conv.f[0]; - } - - //Integer Reduce - template<> - inline Integer Reduce::operator()(__m256i in){ - __m128i ret; -#if defined (AVX2) - // AVX2 horizontal adds within upper and lower halves of register; use - // SSE to add upper and lower halves for result. - __m256i v1, v2; - __m128i u1, u2; - v1 = _mm256_hadd_epi32(in, in); - v2 = _mm256_hadd_epi32(v1, v1); - u1 = _mm256_castsi256_si128(v2); // upper half - u2 = _mm256_extracti128_si256(v2, 1); // lower half - ret = _mm_add_epi32(u1, u2); -#else - // No AVX horizontal add; extract upper and lower halves of register & use - // SSE intrinsics. 
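// (Editorial aside, not part of the original patch: _mm256_extractf128_si256
// with index 0 yields the LOW 128 bits and index 1 the HIGH bits, and
// _mm256_castsi256_si128 above likewise returns the low half, so the
// upper/lower labels in this routine are swapped. The reduction is unaffected,
// because the two halves are simply added together.)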
- __m128i u1, u2, u3; - u1 = _mm256_extractf128_si256(in, 0); // upper half - u2 = _mm256_extractf128_si256(in, 1); // lower half - u3 = _mm_add_epi32(u1, u2); - u1 = _mm_hadd_epi32(u3, u3); - ret = _mm_hadd_epi32(u1, u1); -#endif - return _mm_cvtsi128_si32(ret); - } +}; +//Complex float Reduce +template<> +inline Grid::ComplexF Reduce::operator()(__m256 in){ + __m256 v1,v2; + v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single + v1= _mm256_add_ps(v1,in); + v2=Optimization::Permute::Permute1(v1); + v1 = _mm256_add_ps(v1,v2); + u256f conv; conv.v = v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); } +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(__m256 in){ + __m256 v1,v2; + v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double + v1 = _mm256_add_ps(v1,in); + v2 = Optimization::Permute::Permute1(v1); + v1 = _mm256_add_ps(v1,v2); + v2 = Optimization::Permute::Permute2(v1); + v1 = _mm256_add_ps(v1,v2); + u256f conv; conv.v=v1; + return conv.f[0]; +} + + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(__m256d in){ + __m256d v1; + v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single + v1 = _mm256_add_pd(v1,in); + u256d conv; conv.v = v1; + return Grid::ComplexD(conv.f[0],conv.f[1]); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(__m256d in){ + __m256d v1,v2; + v1 = Optimization::Permute::Permute0(in); // avx 256; quad double + v1 = _mm256_add_pd(v1,in); + v2 = Optimization::Permute::Permute1(v1); + v1 = _mm256_add_pd(v1,v2); + u256d conv; conv.v = v1; + return conv.f[0]; +} + +//Integer Reduce +template<> +inline Integer Reduce::operator()(__m256i in){ + __m128i ret; +#if defined (AVX2) + // AVX2 horizontal adds within upper and lower halves of register; use + // SSE to add upper and lower halves for result. + __m256i v1, v2; + __m128i u1, u2; + v1 = _mm256_hadd_epi32(in, in); + v2 = _mm256_hadd_epi32(v1, v1); + u1 = _mm256_castsi256_si128(v2); // upper half + u2 = _mm256_extracti128_si256(v2, 1); // lower half + ret = _mm_add_epi32(u1, u2); +#else + // No AVX horizontal add; extract upper and lower halves of register & use + // SSE intrinsics. 
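// (Editorial aside, not part of the original patch: as a worked example of the
// AVX2 path above, take in = [1,2,3,4,5,6,7,8]. The first hadd gives
// [3,7,3,7 | 11,15,11,15], the second [10,10,10,10 | 26,26,26,26], and adding
// the two 128-bit halves leaves 36 = 1+2+...+8 in every slot before the final
// _mm_cvtsi128_si32 extract.)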
+ __m128i u1, u2, u3; + u1 = _mm256_extractf128_si256(in, 0); // upper half + u2 = _mm256_extractf128_si256(in, 1); // lower half + u3 = _mm_add_epi32(u1, u2); + u1 = _mm_hadd_epi32(u3, u3); + ret = _mm_hadd_epi32(u1, u1); +#endif + return _mm_cvtsi128_si32(ret); +} + +NAMESPACE_END(Optimization); + ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m256i SIMD_Htype; // Single precision type - typedef __m256 SIMD_Ftype; // Single precision type - typedef __m256d SIMD_Dtype; // Double precision type - typedef __m256i SIMD_Itype; // Integer type +typedef __m256i SIMD_Htype; // Single precision type +typedef __m256 SIMD_Ftype; // Single precision type +typedef __m256d SIMD_Dtype; // Double precision type +typedef __m256i SIMD_Itype; // Integer type - // prefecthing - inline void v_prefetch0(int size, const char *ptr){ - for(int i=0;i using ReduceSIMD = Optimization::Reduce; +template using ReduceSIMD = Optimization::Reduce; - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; -} // namespace Grid +NAMESPACE_END(Grid) From fbc2380cb865174057b20b471b9b7c2e09e3dfce Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 18:05:36 +0000 Subject: [PATCH 004/754] NAMESPACE & format --- lib/simd/Grid_avx512.h | 1069 ++++++++++++++++++++-------------------- 1 file changed, 530 insertions(+), 539 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 85d27421..ff572464 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,112 +25,107 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); -namespace Grid{ -namespace Optimization { +union u512f { + __m512 v; + float f[16]; +}; - union u512f { - __m512 v; - float f[16]; - }; - - union u512d { - __m512d v; - double f[8]; - }; +union u512d { + __m512d v; + double f[8]; +}; - struct Vsplat{ - //Complex float - inline __m512 operator()(float a, float b){ - return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); - } - // Real float - inline __m512 operator()(float a){ - return _mm512_set1_ps(a); - } - //Complex double - inline __m512d operator()(double a, double b){ - return _mm512_set_pd(b,a,b,a,b,a,b,a); - } - //Real double - inline __m512d operator()(double a){ - return _mm512_set1_pd(a); - } - //Integer - inline __m512i operator()(Integer a){ - return _mm512_set1_epi32(a); - } - }; +struct Vsplat{ + //Complex float + inline __m512 operator()(float a, float b){ + return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); + } + // Real float + inline __m512 operator()(float a){ + return _mm512_set1_ps(a); + } + //Complex double + inline __m512d operator()(double a, double b){ + return _mm512_set_pd(b,a,b,a,b,a,b,a); + } + //Real double + inline __m512d operator()(double a){ + return _mm512_set1_pd(a); + } + //Integer + inline __m512i operator()(Integer a){ + return _mm512_set1_epi32(a); + } +}; - struct Vstore{ - //Float - inline void operator()(__m512 a, float* F){ - _mm512_store_ps(F,a); - } - //Double - inline void operator()(__m512d a, double* D){ - _mm512_store_pd(D,a); - } - //Integer - inline void operator()(__m512i a, Integer* I){ - _mm512_store_si512((__m512i *)I,a); - } +struct Vstore{ + //Float + inline void operator()(__m512 a, float* F){ + _mm512_store_ps(F,a); + } + //Double + inline void operator()(__m512d a, double* D){ + _mm512_store_pd(D,a); + } + //Integer + inline void operator()(__m512i a, Integer* I){ + _mm512_store_si512((__m512i *)I,a); + } - }; +}; +struct Vstream{ + //Float + inline void operator()(float * a, __m512 b){ + _mm512_stream_ps(a,b); + // _mm512_store_ps(a,b); + } + //Double + inline void operator()(double * a, __m512d b){ + _mm512_stream_pd(a,b); + // _mm512_store_pd(a,b); + } - struct Vstream{ - //Float - inline void operator()(float * a, __m512 b){ - _mm512_stream_ps(a,b); - // _mm512_store_ps(a,b); - } - //Double - inline void operator()(double * a, __m512d b){ - _mm512_stream_pd(a,b); - // _mm512_store_pd(a,b); - } +}; - }; +struct Vset{ + // Complex float + inline __m512 operator()(Grid::ComplexF *a){ + return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), + a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), + a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m512d operator()(Grid::ComplexD *a){ + return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m512 operator()(float *a){ + return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m512d operator()(double *a){ + return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Integer + inline 
__m512i operator()(Integer *a){ + return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } +}; - - struct Vset{ - // Complex float - inline __m512 operator()(Grid::ComplexF *a){ - return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), - a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), - a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), - a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Complex double - inline __m512d operator()(Grid::ComplexD *a){ - return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), - a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Real float - inline __m512 operator()(float *a){ - return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - // Real double - inline __m512d operator()(double *a){ - return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - // Integer - inline __m512i operator()(Integer *a){ - return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - - - }; - - template +template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled @@ -140,501 +135,497 @@ namespace Optimization { return 0; } }; - - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_add_ps(a,b); - } - //Complex/Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_add_pd(a,b); - } - //Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_add_epi32(a,b); - } - }; +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// +struct Sum{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_add_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_add_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_add_epi32(a,b); + } +}; - struct Sub{ - //Complex/Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_sub_ps(a,b); - } - //Complex/Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_sub_pd(a,b); - } - //Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_sub_epi32(a,b); - } - }; +struct Sub{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_sub_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_sub_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_sub_epi32(a,b); + } +}; - // Note, we can beat the shuf overhead in chain with two temporaries - // Ar Ai , Br Bi, Ai Ar // one shuf - //tmpr Ar Br, Ai Bi // Mul/Mac/Mac - //tmpi Br Ai, Bi Ar // Mul/Mac/Mac - // add tmpi,shuf(tmpi) - // sub tmpr,shuf(tmpi) - // shuf(tmpr,tmpi). // Could drop/trade for write mask +// Note, we can beat the shuf overhead in chain with two temporaries +// Ar Ai , Br Bi, Ai Ar // one shuf +//tmpr Ar Br, Ai Bi // Mul/Mac/Mac +//tmpi Br Ai, Bi Ar // Mul/Mac/Mac +// add tmpi,shuf(tmpi) +// sub tmpr,shuf(tmpi) +// shuf(tmpr,tmpi). 
// Could drop/trade for write mask - // Gives - // 2mul,4 mac +add+sub = 8 flop type insns - // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. +// Gives +// 2mul,4 mac +add+sub = 8 flop type insns +// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. - struct MultRealPart{ - inline __m512 operator()(__m512 a, __m512 b){ - __m512 ymm0; - ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, - return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br - } - inline __m512d operator()(__m512d a, __m512d b){ - __m512d ymm0; - ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 - return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br - } - }; - struct MaddRealPart{ - inline __m512 operator()(__m512 a, __m512 b, __m512 c){ - __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, - return _mm512_fmadd_ps( ymm0, b, c); - } - inline __m512d operator()(__m512d a, __m512d b, __m512d c){ - __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); - return _mm512_fmadd_pd( ymm0, b, c); - } - }; +struct MultRealPart{ + inline __m512 operator()(__m512 a, __m512 b){ + __m512 ymm0; + ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m512d operator()(__m512d a, __m512d b){ + __m512d ymm0; + ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } +}; +struct MaddRealPart{ + inline __m512 operator()(__m512 a, __m512 b, __m512 c){ + __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_fmadd_ps( ymm0, b, c); + } + inline __m512d operator()(__m512d a, __m512d b, __m512d c){ + __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); + return _mm512_fmadd_pd( ymm0, b, c); + } +}; - struct MultComplex{ - // Complex float - inline __m512 operator()(__m512 a, __m512 b){ - // dup, dup, perm, mul, madd - __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar - __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai - a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br - return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr - } - // Complex double - inline __m512d operator()(__m512d a, __m512d b){ - __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 ); - __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF ); - a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) ); - return _mm512_fmaddsub_pd( a_real, b, a_imag ); - } - }; +struct MultComplex{ + // Complex float + inline __m512 operator()(__m512 a, __m512 b){ + // dup, dup, perm, mul, madd + __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar + __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai + a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br + return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr + } + // Complex double + inline __m512d operator()(__m512d a, __m512d b){ + __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 ); + __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF ); + a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) ); + return _mm512_fmaddsub_pd( a_real, b, a_imag ); + } +}; - struct Mult{ +struct Mult{ - inline void mac(__m512 &a, __m512 b, __m512 c){ - a= _mm512_fmadd_ps( b, c, a); - } - inline void mac(__m512d &a, __m512d b, __m512d c){ - a= _mm512_fmadd_pd( b, c, a); - } - // Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_mul_ps(a,b); - } - // Real double - inline __m512d operator()(__m512d a, 
__m512d b){ - return _mm512_mul_pd(a,b); - } - // Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_mullo_epi32(a,b); - } - }; + inline void mac(__m512 &a, __m512 b, __m512 c){ + a= _mm512_fmadd_ps( b, c, a); + } + inline void mac(__m512d &a, __m512d b, __m512d c){ + a= _mm512_fmadd_pd( b, c, a); + } + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_mul_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_mul_pd(a,b); + } + // Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_mullo_epi32(a,b); + } +}; - struct Div{ - // Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_div_ps(a,b); - } - // Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_div_pd(a,b); - } - }; +struct Div{ + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_div_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_div_pd(a,b); + } +}; - struct Conj{ - // Complex single - inline __m512 operator()(__m512 in){ - return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag - } - // Complex double - inline __m512d operator()(__m512d in){ - return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); - } - // do not define for integer input - }; +struct Conj{ + // Complex single + inline __m512 operator()(__m512 in){ + return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag + } + // Complex double + inline __m512d operator()(__m512d in){ + return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); + } + // do not define for integer input +}; - struct TimesMinusI{ - //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ - //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag - //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E?? - __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); - } - //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ - //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag - //return _mm512_shuffle_pd(tmp,tmp,0x55); - __m512d tmp = _mm512_shuffle_pd(in,in,0x55); - return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); - } - }; +struct TimesMinusI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag + //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E?? 
+ __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag + //return _mm512_shuffle_pd(tmp,tmp,0x55); + __m512d tmp = _mm512_shuffle_pd(in,in,0x55); + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); + } +}; - struct TimesI{ - //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); - } - //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_shuffle_pd(in,in,0x55); - return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); - } - - - }; +struct TimesI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_shuffle_pd(in,in,0x55); + return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); + } +}; - // Gpermute utilities consider coalescing into 1 Gpermute - struct Permute{ +// Gpermute utilities consider coalescing into 1 Gpermute +struct Permute{ - static inline __m512 Permute0(__m512 in){ - return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); - }; - static inline __m512 Permute1(__m512 in){ - return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - }; - static inline __m512 Permute2(__m512 in){ - return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); - }; - static inline __m512 Permute3(__m512 in){ - return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - }; - - static inline __m512d Permute0(__m512d in){ - return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); - }; - static inline __m512d Permute1(__m512d in){ - return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - }; - static inline __m512d Permute2(__m512d in){ - return _mm512_shuffle_pd(in,in,0x55); - }; - static inline __m512d Permute3(__m512d in){ - return in; - }; - + static inline __m512 Permute0(__m512 in){ + return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); }; + static inline __m512 Permute1(__m512 in){ + return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + }; + static inline __m512 Permute2(__m512 in){ + return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + }; + static inline __m512 Permute3(__m512 in){ + return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + }; + + static inline __m512d Permute0(__m512d in){ + return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + }; + static inline __m512d Permute1(__m512d in){ + return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + }; + static inline __m512d Permute2(__m512d in){ + return _mm512_shuffle_pd(in,in,0x55); + }; + static inline __m512d Permute3(__m512d in){ + return in; + }; + +}; #define USE_FP16 - struct PrecisionChange { - static inline __m512i StoH (__m512 a,__m512 b) { - __m512i h; +struct PrecisionChange { + static inline __m512i StoH (__m512 a,__m512 b) { + __m512i h; #ifdef USE_FP16 - __m256i ha = _mm512_cvtps_ph(a,0); - __m256i hb = _mm512_cvtps_ph(b,0); - h =(__m512i) _mm512_castps256_ps512((__m256)ha); - h =(__m512i) 
_mm512_insertf64x4((__m512d)h,(__m256d)hb,1); + __m256i ha = _mm512_cvtps_ph(a,0); + __m256i hb = _mm512_cvtps_ph(b,0); + h =(__m512i) _mm512_castps256_ps512((__m256)ha); + h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1); #else - assert(0); + assert(0); #endif - return h; - } - static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { + return h; + } + + static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { #ifdef USE_FP16 - sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0)); - sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1)); + sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0)); + sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1)); #else - assert(0); + assert(0); #endif - } - static inline __m512 DtoS (__m512d a,__m512d b) { - __m256 sa = _mm512_cvtpd_ps(a); - __m256 sb = _mm512_cvtpd_ps(b); - __m512 s = _mm512_castps256_ps512(sa); - s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); - return s; - } - static inline void StoD (__m512 s,__m512d &a,__m512d &b) { - a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0)); - b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1)); - } - static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { - __m512 sa,sb; - sa = DtoS(a,b); - sb = DtoS(c,d); - return StoH(sa,sb); - } - static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { - __m512 sa,sb; - HtoS(h,sa,sb); - StoD(sa,a,b); - StoD(sb,c,d); - } + } + + static inline __m512 DtoS (__m512d a,__m512d b) { + __m256 sa = _mm512_cvtpd_ps(a); + __m256 sb = _mm512_cvtpd_ps(b); + __m512 s = _mm512_castps256_ps512(sa); + s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); + return s; + } + + static inline void StoD (__m512 s,__m512d &a,__m512d &b) { + a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0)); + b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1)); + } + + static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { + __m512 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + + static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { + __m512 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } +}; +// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl +// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl +// The operation is its own inverse +struct Exchange{ + // 3210 ordering + static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + }; + static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= 
_mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ }; - // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl - // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl - // The operation is its own inverse - struct Exchange{ - // 3210 ordering - static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ - out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); - }; - static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ - out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); - out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - }; - static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ - out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); - }; - static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ - out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); - out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - }; - static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ - out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); - }; - static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ - out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); - out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - }; - static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ - out1 = _mm512_shuffle_pd(in1,in2,0x00); - out2 = _mm512_shuffle_pd(in1,in2,0xFF); - }; - static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ - assert(0); - return; - }; + static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + }; + static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1 = _mm512_shuffle_pd(in1,in2,0x00); + out2 = _mm512_shuffle_pd(in1,in2,0xFF); + }; + static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + assert(0); + return; + }; +}; + + +struct Rotate{ + + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: 
return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); + } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); }; - - struct Rotate{ - - static inline __m512 rotate(__m512 in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - case 4: return tRotate<4>(in);break; - case 5: return tRotate<5>(in);break; - case 6: return tRotate<6>(in);break; - case 7: return tRotate<7>(in);break; - - case 8 : return tRotate<8>(in);break; - case 9 : return tRotate<9>(in);break; - case 10: return tRotate<10>(in);break; - case 11: return tRotate<11>(in);break; - case 12: return tRotate<12>(in);break; - case 13: return tRotate<13>(in);break; - case 14: return tRotate<14>(in);break; - case 15: return tRotate<15>(in);break; - default: assert(0); - } - } - static inline __m512d rotate(__m512d in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - case 4: return tRotate<4>(in);break; - case 5: return tRotate<5>(in);break; - case 6: return tRotate<6>(in);break; - case 7: return tRotate<7>(in);break; - default: assert(0); - } - } - - template static inline __m512 tRotate(__m512 in){ - return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); - }; - - template static inline __m512d tRotate(__m512d in){ - return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n); - }; - + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n); }; - ////////////////////////////////////////////// - // Some Template specialization +}; - // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases +////////////////////////////////////////////// +// Some Template specialization + +// Hack for CLANG until mm512_reduce_add_ps etc... 
are implemented in GCC and Clang releases
#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
-  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1=Optimization::Permute::Permute0(in); // avx 512; quad complex single
-    v1= _mm512_add_ps(v1,in);
-    v2=Optimization::Permute::Permute1(v1);
-    v1 = _mm512_add_ps(v1,v2);
-    v2=Optimization::Permute::Permute2(v1);
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v = v1;
-    return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
+//Complex float Reduce
+template<>
+inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+  __m512 v1,v2;
+  v1=Optimization::Permute::Permute0(in); // avx 512; octo complex single
+  v1= _mm512_add_ps(v1,in);
+  v2=Optimization::Permute::Permute1(v1);
+  v1 = _mm512_add_ps(v1,v2);
+  v2=Optimization::Permute::Permute2(v1);
+  v1 = _mm512_add_ps(v1,v2);
+  u512f conv; conv.v = v1;
+  return Grid::ComplexF(conv.f[0],conv.f[1]);
+}

-  //Real float Reduce
-  template<>
-  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; octo-double
-    v1 = _mm512_add_ps(v1,in);
-    v2 = Optimization::Permute::Permute1(v1);
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute2(v1);
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute3(v1);
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v=v1;
-    return conv.f[0];
-  }
+//Real float Reduce
+template<>
+inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+  __m512 v1,v2;
+  v1 = Optimization::Permute::Permute0(in); // avx 512; sixteen singles
+  v1 = _mm512_add_ps(v1,in);
+  v2 = Optimization::Permute::Permute1(v1);
+  v1 = _mm512_add_ps(v1,v2);
+  v2 = Optimization::Permute::Permute2(v1);
+  v1 = _mm512_add_ps(v1,v2);
+  v2 = Optimization::Permute::Permute3(v1);
+  v1 = _mm512_add_ps(v1,v2);
+  u512f conv; conv.v=v1;
+  return conv.f[0];
+}

-  //Complex double Reduce
-  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
-    __m512d v1;
-    v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    v1 = Optimization::Permute::Permute1(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    u512d conv; conv.v = v1;
-    return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
+//Complex double Reduce
+template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+  __m512d v1,v2;
+  v1 = Optimization::Permute::Permute0(in); // avx 512; quad complex double
+  v1 = _mm512_add_pd(v1,in);
+  v2 = Optimization::Permute::Permute1(v1); // fold the accumulated halves, not the raw input
+  v1 = _mm512_add_pd(v1,v2);
+  u512d conv; conv.v = v1;
+  return Grid::ComplexD(conv.f[0],conv.f[1]);
+}

-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
-    __m512d v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; quad double
-    v1 = _mm512_add_pd(v1,in);
-    v2 = Optimization::Permute::Permute1(v1);
-    v1 = _mm512_add_pd(v1,v2);
-    v2 = Optimization::Permute::Permute2(v1);
-    v1 = _mm512_add_pd(v1,v2);
-    u512d conv; conv.v = v1;
-    return conv.f[0];
-  }
+//Real double Reduce
+template<>
+inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+  __m512d v1,v2;
+  v1 = Optimization::Permute::Permute0(in); // avx 512; octo double
+  v1 = _mm512_add_pd(v1,in);
+  v2 = Optimization::Permute::Permute1(v1);
+  v1 = _mm512_add_pd(v1,v2);
+  v2 = Optimization::Permute::Permute2(v1);
+  v1 = _mm512_add_pd(v1,v2);
+  u512d conv; conv.v = v1;
+  return conv.f[0];
+}
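Note: the masked intrinsics used in the fast path (the #else branch below) make the complex reduction one instruction per part: mask 0x5555 selects the even (real) lanes and 0xAAAA the odd (imaginary) lanes of the sixteen floats. A scalar model of the mask semantics, for illustration only (not part of the patch):

// Scalar model of _mm512_mask_reduce_add_ps(mask, v): lane i contributes
// only when bit i of the mask is set, so 0x5555 sums the real parts and
// 0xAAAA the imaginary parts of an interleaved complex vector.
float mask_reduce_add(const float lane[16], unsigned mask) {
  float sum = 0.f;
  for (int i = 0; i < 16; i++)
    if (mask & (1u << i)) sum += lane[i];
  return sum;
}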
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+  // No full vector reduce; use AVX to add the lower and upper halves of the
+  // register and perform an AVX reduction.
+  __m256i v1, v2, v3;
+  __m128i u1, u2, ret;
+  v1 = _mm512_castsi512_si256(in);       // lower half
+  v2 = _mm512_extracti32x8_epi32(in, 1); // upper half
+  v3 = _mm256_add_epi32(v1, v2);
+  v1 = _mm256_hadd_epi32(v3, v3);
+  v2 = _mm256_hadd_epi32(v1, v1);
+  u1 = _mm256_castsi256_si128(v2);       // lower half
+  u2 = _mm256_extracti128_si256(v2, 1);  // upper half
+  ret = _mm_add_epi32(u1, u2);
+  return _mm_cvtsi128_si32(ret);
+}
#else
-  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
-    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
-  }
-  //Real float Reduce
-  template<>
-  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
-    return _mm512_reduce_add_ps(in);
-  }
+//Complex float Reduce
+template<>
+inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+  return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
+}
+//Real float Reduce
+template<>
+inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+  return _mm512_reduce_add_ps(in);
+}

-  //Complex double Reduce
-  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
-    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
-  }
+//Complex double Reduce
+template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+  return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
+}

-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
-    return _mm512_reduce_add_pd(in);
-  }
+//Real double Reduce
+template<>
+inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+  return _mm512_reduce_add_pd(in);
+}

-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    return _mm512_reduce_add_epi32(in);
-  }
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+  return _mm512_reduce_add_epi32(in);
+}
#endif
-}
+NAMESPACE_END(Optimization);
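Note: the slow-path integer reduction above is a narrowing ladder: fold 512 bits to 256, 256 to 128, then horizontal-add within 128 bits. A self-contained sketch of the same ladder, for illustration only (assumes AVX512F, AVX512DQ for _mm512_extracti32x8_epi32, and SSSE3 for _mm_hadd_epi32):

#include <immintrin.h>

// Fold 16 x int32 down to a scalar: 512 -> 256 -> 128 bits with adds, then
// two horizontal adds leave the full sum in lane 0.
static inline int reduce_add_epi32_512(__m512i in) {
  __m256i lo256 = _mm512_castsi512_si256(in);        // lower 256 bits
  __m256i hi256 = _mm512_extracti32x8_epi32(in, 1);  // upper 256 bits
  __m256i s256  = _mm256_add_epi32(lo256, hi256);    // 8 partial sums
  __m128i lo128 = _mm256_castsi256_si128(s256);      // lower 128 bits
  __m128i hi128 = _mm256_extracti128_si256(s256, 1); // upper 128 bits
  __m128i s128  = _mm_add_epi32(lo128, hi128);       // 4 partial sums
  s128 = _mm_hadd_epi32(s128, s128);                 // 2 partial sums
  s128 = _mm_hadd_epi32(s128, s128);                 // total in lane 0
  return _mm_cvtsi128_si32(s128);
}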
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types
+typedef __m512i SIMD_Htype; // Half precision type
+typedef __m512  SIMD_Ftype; // Single precision type
+typedef __m512d SIMD_Dtype; // Double precision type
+typedef __m512i SIMD_Itype; // Integer type

-  typedef __m512i SIMD_Htype; // Single precision type
-  typedef __m512  SIMD_Ftype; // Single precision type
-  typedef __m512d SIMD_Dtype; // Double precision type
-  typedef __m512i SIMD_Itype; // Integer type
-
-  // prefecth
-  inline void v_prefetch0(int size, const char *ptr){
-    for(int i=0;i<size;i+=64){ //  Define L1 linesize above
-      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
-      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
-    }
-  }
-  inline void prefetch_HINT_T0(const char *ptr){
-    _mm_prefetch(ptr,_MM_HINT_T0);
-  }
-
-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template < typename S, typename T > using ReduceSIMD = Optimization::Reduce<S,T>;
-
-  // Arithmetic operations
-  typedef Optimization::Sum          SumSIMD;
-  typedef Optimization::Sub          SubSIMD;
-  typedef Optimization::Mult         MultSIMD;
-  typedef Optimization::Div          DivSIMD;
-  typedef Optimization::MultComplex  MultComplexSIMD;
-  typedef Optimization::MultRealPart MultRealPartSIMD;
-  typedef Optimization::MaddRealPart MaddRealPartSIMD;
-  typedef Optimization::Conj         ConjSIMD;
-  typedef Optimization::TimesMinusI  TimesMinusISIMD;
-  typedef Optimization::TimesI       TimesISIMD;
-}
+// prefetch
+inline void v_prefetch0(int size, const char *ptr){
+  for(int i=0;i<size;i+=64){ //  Define L1 linesize above
+    _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
+    _mm_prefetch(ptr+i+512,_MM_HINT_T0);
+  }
+}
+inline void prefetch_HINT_T0(const char *ptr){
+  _mm_prefetch(ptr,_MM_HINT_T0);
+}
+
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template < typename S, typename T > using ReduceSIMD = Optimization::Reduce<S,T>;
+
+// Arithmetic operations
+typedef Optimization::Sum          SumSIMD;
+typedef Optimization::Sub          SubSIMD;
+typedef Optimization::Mult         MultSIMD;
+typedef Optimization::Div          DivSIMD;
+typedef Optimization::MultComplex  MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
+typedef Optimization::MaddRealPart MaddRealPartSIMD;
+typedef Optimization::Conj         ConjSIMD;
+typedef Optimization::TimesMinusI  TimesMinusISIMD;
+typedef Optimization::TimesI       TimesISIMD;
+
+NAMESPACE_END(Grid);
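Note: each architecture header ends by exporting the same set of names (SIMD_Ftype, SumSIMD, MultComplexSIMD, ...), which is what lets the rest of the library be written once against those names. A hypothetical wrapper in that style, illustrating only the dispatch pattern (the class and names below are not from the library; assumes the Grid names are in scope):

// Hypothetical Grid_simd-style wrapper: generic code sees only the exported
// typedefs, so retargeting to another ISA is just including a different
// header that defines the same names.
template <class scalar, class vector>
struct simd_sketch {
  vector v;
  friend simd_sketch operator+(simd_sketch a, simd_sketch b) {
    simd_sketch r;
    r.v = SumSIMD()(a.v, b.v); // dispatch through the exported functor type
    return r;
  }
};
typedef simd_sketch<Grid::RealF, SIMD_Ftype> vRealF_sketch;

From bbb657da5c31fdbcfbb2b695533305a85039ed90 Mon Sep 17 00:00:00 2001
From: paboyle 
Date: Fri, 12 Jan 2018 18:10:11 +0000
Subject: [PATCH 005/754] NAMESPACE and formatting

---
 lib/simd/Grid_generic.h | 724 ++++++++++++++++++++--------------------
 1 file changed, 363 insertions(+), 361 deletions(-)

diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h
index e1d5f894..d555a672 100644
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -25,413 +25,413 @@ Author: Antonin Portelli
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.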
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include "Grid_generic_types.h" -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); - struct Vsplat{ - // Complex - template - inline vec operator()(T a, T b){ - vec out; +struct Vsplat{ + // Complex + template + inline vec operator()(T a, T b){ + vec out; - VECTOR_FOR(i, W::r, 2) + VECTOR_FOR(i, W::r, 2) { out.v[i] = a; out.v[i+1] = b; } - return out; - } + return out; + } - // Real - template - inline vec operator()(T a){ - vec out; + // Real + template + inline vec operator()(T a){ + vec out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a; } - return out; - } - }; + return out; + } +}; - struct Vstore{ - // Real - template - inline void operator()(vec a, T *D){ - *((vec *)D) = a; - } - }; +struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ + *((vec *)D) = a; + } +}; - struct Vstream{ - // Real - template - inline void operator()(T * a, vec b){ - *((vec *)a) = b; - } - }; +struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ + *((vec *)a) = b; + } +}; - struct Vset{ - // Complex - template - inline vec operator()(std::complex *a){ - vec out; +struct Vset{ + // Complex + template + inline vec operator()(std::complex *a){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { out.v[2*i] = a[i].real(); out.v[2*i+1] = a[i].imag(); } - return out; - } + return out; + } - // Real - template - inline vec operator()(T *a){ - vec out; + // Real + template + inline vec operator()(T *a){ + vec out; - out = *((vec *)a); + out = *((vec *)a); - return out; - } - }; + return out; + } +}; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - // Complex/Real - template - inline vec operator()(vec a, vec b){ - vec out; +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// +struct Sum{ + // Complex/Real + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] + b.v[i]; } - return out; - } - }; + return out; + } +}; - struct Sub{ - // Complex/Real - template - inline vec operator()(vec a, vec b){ - vec out; +struct Sub{ + // Complex/Real + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] - b.v[i]; } - return out; - } - }; + return out; + } +}; - struct Mult{ - // Real - template - inline vec operator()(vec a, vec b){ - vec out; +struct Mult{ + // Real + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]*b.v[i]; } - return out; - } - }; + return out; + } +}; - #define cmul(a, b, c, i)\ - c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ +#define cmul(a, b, c, i) \ + c[i] = a[i]*b[i] - a[i+1]*b[i+1]; \ c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; - struct MultRealPart{ - template - inline vec operator()(vec a, vec b){ - vec out; +struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { - out.v[2*i] = a.v[2*i]*b.v[2*i]; - out.v[2*i+1] = 
a.v[2*i]*b.v[2*i+1]; + out.v[2*i] = a.v[2*i]*b.v[2*i]; + out.v[2*i+1] = a.v[2*i]*b.v[2*i+1]; } - return out; - } - }; + return out; + } +}; - struct MaddRealPart{ - template - inline vec operator()(vec a, vec b, vec c){ - vec out; +struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { - out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i]; - out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1]; + out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i]; + out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1]; } - return out; - } - }; + return out; + } +}; - struct MultComplex{ - // Complex - template - inline vec operator()(vec a, vec b){ - vec out; +struct MultComplex{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { cmul(a.v, b.v, out.v, 2*i); } - return out; - } - }; + return out; + } +}; - #undef cmul +#undef cmul - struct Div{ - // Real - template - inline vec operator()(vec a, vec b){ - vec out; +struct Div{ + // Real + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]/b.v[i]; } - return out; - } - }; + return out; + } +}; - #define conj(a, b, i)\ - b[i] = a[i];\ +#define conj(a, b, i) \ + b[i] = a[i]; \ b[i+1] = -a[i+1]; - struct Conj{ - // Complex - template - inline vec operator()(vec a){ - vec out; +struct Conj{ + // Complex + template + inline vec operator()(vec a){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { conj(a.v, out.v, 2*i); } - return out; - } - }; + return out; + } +}; - #undef conj +#undef conj - #define timesmi(a, b, i)\ - b[i] = a[i+1];\ +#define timesmi(a, b, i) \ + b[i] = a[i+1]; \ b[i+1] = -a[i]; - struct TimesMinusI{ - // Complex - template - inline vec operator()(vec a, vec b){ - vec out; +struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { timesmi(a.v, out.v, 2*i); } - return out; - } - }; + return out; + } +}; - #undef timesmi +#undef timesmi - #define timesi(a, b, i)\ - b[i] = -a[i+1];\ +#define timesi(a, b, i) \ + b[i] = -a[i+1]; \ b[i+1] = a[i]; - struct TimesI{ - // Complex - template - inline vec operator()(vec a, vec b){ - vec out; +struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; - VECTOR_FOR(i, W::c, 1) + VECTOR_FOR(i, W::c, 1) { timesi(a.v, out.v, 2*i); } - return out; - } - }; + return out; + } +}; - #undef timesi +#undef timesi - struct PrecisionChange { - static inline vech StoH (const vecf &a,const vecf &b) { - vech ret; +struct PrecisionChange { + static inline vech StoH (const vecf &a,const vecf &b) { + vech ret; #ifdef USE_FP16 - vech *ha = (vech *)&a; - vech *hb = (vech *)&b; - const int nf = W::r; - // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } - // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } - VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } - VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } + vech *ha = (vech *)&a; + vech *hb = (vech *)&b; + const int nf = W::r; + // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } + // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } + VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } + VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } #else - assert(0); + assert(0); #endif - return ret; - } - static inline void HtoS (vech h,vecf &sa,vecf &sb) { + return ret; + } + static inline 
void HtoS (vech h,vecf &sa,vecf &sb) { #ifdef USE_FP16 - const int nf = W::r; - const int nh = W::r; - vech *ha = (vech *)&sa; - vech *hb = (vech *)&sb; - VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } - // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} - // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} - VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } - VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } + const int nf = W::r; + const int nh = W::r; + vech *ha = (vech *)&sa; + vech *hb = (vech *)&sb; + VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} + VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } + VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } #else - assert(0); + assert(0); #endif - } - static inline vecf DtoS (vecd a,vecd b) { - const int nd = W::r; - const int nf = W::r; - vecf ret; - VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } - VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } - return ret; - } - static inline void StoD (vecf s,vecd &a,vecd &b) { - const int nd = W::r; - VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } - VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } - } - static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { - vecf sa,sb; - sa = DtoS(a,b); - sb = DtoS(c,d); - return StoH(sa,sb); - } - static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { - vecf sa,sb; - HtoS(h,sa,sb); - StoD(sa,a,b); - StoD(sb,c,d); - } - }; + } + static inline vecf DtoS (vecd a,vecd b) { + const int nd = W::r; + const int nf = W::r; + vecf ret; + VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } + VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + const int nd = W::r; + VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } + VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } +}; - ////////////////////////////////////////////// - // Exchange support - struct Exchange{ +////////////////////////////////////////////// +// Exchange support +struct Exchange{ - template + template static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ - const int w = W::r; - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "< - static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ - ExchangeN(out1,out2,in1,in2); - }; - template - static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ - ExchangeN(out1,out2,in1,in2); - }; - template - static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ - ExchangeN(out1,out2,in1,in2); - }; - template - static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ - ExchangeN(out1,out2,in1,in2); - }; + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); }; + template + static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; 
+}; - ////////////////////////////////////////////// - // Some Template specialization - #define perm(a, b, n, w)\ - unsigned int _mask = w >> (n + 1);\ - VECTOR_FOR(i, w, 1)\ - {\ - b[i] = a[i^_mask];\ +////////////////////////////////////////////// +// Some Template specialization +#define perm(a, b, n, w) \ + unsigned int _mask = w >> (n + 1); \ + VECTOR_FOR(i, w, 1) \ + { \ + b[i] = a[i^_mask]; \ } - #define DECL_PERMUTE_N(n)\ - template \ - static inline vec Permute##n(vec in) {\ - vec out;\ - perm(in.v, out.v, n, W::r);\ - return out;\ +#define DECL_PERMUTE_N(n) \ + template \ + static inline vec Permute##n(vec in) { \ + vec out; \ + perm(in.v, out.v, n, W::r); \ + return out; \ } - struct Permute{ - DECL_PERMUTE_N(0); - DECL_PERMUTE_N(1); - DECL_PERMUTE_N(2); - DECL_PERMUTE_N(3); - }; +struct Permute{ + DECL_PERMUTE_N(0); + DECL_PERMUTE_N(1); + DECL_PERMUTE_N(2); + DECL_PERMUTE_N(3); +}; - #undef perm - #undef DECL_PERMUTE_N +#undef perm +#undef DECL_PERMUTE_N - #define rot(a, b, n, w)\ - VECTOR_FOR(i, w, 1)\ - {\ - b[i] = a[(i + n)%w];\ +#define rot(a, b, n, w) \ + VECTOR_FOR(i, w, 1) \ + { \ + b[i] = a[(i + n)%w]; \ } - struct Rotate{ +struct Rotate{ - template static inline vec tRotate(vec in){ - return rotate(in, n); - } + template static inline vec tRotate(vec in){ + return rotate(in, n); + } - template - static inline vec rotate(vec in, int n){ - vec out; + template + static inline vec rotate(vec in, int n){ + vec out; - rot(in.v, out.v, n, W::r); + rot(in.v, out.v, n, W::r); - return out; - } - }; - - #undef rot - - #define acc(v, a, off, step, n)\ - for (unsigned int i = off; i < n; i += step)\ - {\ - a += v[i];\ + return out; } +}; + +#undef rot - template +#define acc(v, a, off, step, n) \ + for (unsigned int i = off; i < n; i += step) \ + { \ + a += v[i]; \ + } + +template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled @@ -442,89 +442,91 @@ namespace Optimization { } }; - //Complex float Reduce - template <> - inline Grid::ComplexF Reduce::operator()(vecf in){ - float a = 0.f, b = 0.f; +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + float a = 0.f, b = 0.f; - acc(in.v, a, 0, 2, W::r); - acc(in.v, b, 1, 2, W::r); + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); - return Grid::ComplexF(a, b); - } - - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(vecf in){ - float a = 0.; - - acc(in.v, a, 0, 1, W::r); - - return a; - } - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(vecd in){ - double a = 0., b = 0.; - - acc(in.v, a, 0, 2, W::r); - acc(in.v, b, 1, 2, W::r); - - return Grid::ComplexD(a, b); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(vecd in){ - double a = 0.f; - - acc(in.v, a, 0, 1, W::r); - - return a; - } - - //Integer Reduce - template<> - inline Integer Reduce::operator()(veci in){ - Integer a = 0; - - acc(in.v, a, 0, 1, W::r); - - return a; - } - - #undef acc // EIGEN compatibility + return Grid::ComplexF(a, b); } + +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(vecf in){ + float a = 0.; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(vecd in){ + double a = 0., b = 0.; + + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(vecd in){ + double a = 
0.;
+
+  acc(in.v, a, 0, 1, W<double>::r);
+
+  return a;
+}
+
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, veci>::operator()(veci in){
+  Integer a = 0;
+
+  acc(in.v, a, 0, 1, W<Integer>::r);
+
+  return a;
+}
+
+#undef acc  // EIGEN compatibility
+NAMESPACE_END(Optimization)

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types
-  typedef Optimization::vech SIMD_Htype; // Reduced precision type
-  typedef Optimization::vecf SIMD_Ftype; // Single precision type
-  typedef Optimization::vecd SIMD_Dtype; // Double precision type
-  typedef Optimization::veci SIMD_Itype; // Integer type
+typedef Optimization::vech SIMD_Htype; // Reduced precision type
+typedef Optimization::vecf SIMD_Ftype; // Single precision type
+typedef Optimization::vecd SIMD_Dtype; // Double precision type
+typedef Optimization::veci SIMD_Itype; // Integer type

-  // prefetch utilities
-  inline void v_prefetch0(int size, const char *ptr){};
-  inline void prefetch_HINT_T0(const char *ptr){};
+// prefetch utilities
+inline void v_prefetch0(int size, const char *ptr){};
+inline void prefetch_HINT_T0(const char *ptr){};

-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template < typename S, typename T > using ReduceSIMD = Optimization::Reduce<S,T>;
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template < typename S, typename T > using ReduceSIMD = Optimization::Reduce<S,T>;
+
+// Arithmetic operations
+typedef Optimization::Sum          SumSIMD;
+typedef Optimization::Sub          SubSIMD;
+typedef Optimization::Div          DivSIMD;
+typedef Optimization::Mult         MultSIMD;
+typedef Optimization::MultComplex  MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
+typedef Optimization::MaddRealPart MaddRealPartSIMD;
+typedef Optimization::Conj         ConjSIMD;
+typedef Optimization::TimesMinusI  TimesMinusISIMD;
+typedef Optimization::TimesI       TimesISIMD;
+
+NAMESPACE_END(Grid)

-  // Arithmetic operations
-  typedef Optimization::Sum          SumSIMD;
-  typedef Optimization::Sub          SubSIMD;
-  typedef Optimization::Div          DivSIMD;
-  typedef Optimization::Mult         MultSIMD;
-  typedef Optimization::MultComplex  MultComplexSIMD;
-  typedef Optimization::MultRealPart MultRealPartSIMD;
-  typedef Optimization::MaddRealPart MaddRealPartSIMD;
-  typedef Optimization::Conj         ConjSIMD;
-  typedef Optimization::TimesMinusI  TimesMinusISIMD;
-  typedef Optimization::TimesI       TimesISIMD;
-}
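Note: the acc macro used by the generic Reduce specializations above is a strided accumulation; acc(v, a, off, step, n) adds every step-th element starting at off. A worked expansion for the complex-float case, for illustration only (assumes a generic width of 8 floats, i.e. W<float>::r == 8; not part of the patch):

#include <complex>

// Expansion of acc(in.v, a, 0, 2, 8) and acc(in.v, b, 1, 2, 8): even lanes
// carry the real parts, odd lanes the imaginary parts of the interleaved
// layout.
std::complex<float> reduce_complex8(const float v[8]) {
  float re = 0.f, im = 0.f;
  for (unsigned int i = 0; i < 8; i += 2) re += v[i]; // acc(v, re, 0, 2, 8)
  for (unsigned int i = 1; i < 8; i += 2) im += v[i]; // acc(v, im, 1, 2, 8)
  return std::complex<float>(re, im);
}

From 6ab744c720286e11af8d8e2cd8ea8c92b9017b06 Mon Sep 17 00:00:00 2001
From: paboyle 
Date: Fri, 12 Jan 2018 18:11:04 +0000
Subject: [PATCH 006/754] NAMESPACE and formatting

---
 lib/simd/Grid_generic_types.h | 85 ++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 42 deletions(-)

diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h
index 642f6ffe..282a76c3 100644
--- a/lib/simd/Grid_generic_types.h
+++ b/lib/simd/Grid_generic_types.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,8 +24,8 @@ Author: Antonin Portelli
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.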
See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */

 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");

 //#define VECTOR_LOOPS

 // playing with compiler pragmas
 #ifdef VECTOR_LOOPS
 #ifdef __clang__
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
-for (unsigned int i = 0; i < w; i += inc)
+#define VECTOR_FOR(i, w, inc) \
+  _Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)") \
+  for (unsigned int i = 0; i < w; i += inc)
 #elif defined __INTEL_COMPILER
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("simd vectorlength(w*8)")\
-for (unsigned int i = 0; i < w; i += inc)
+#define VECTOR_FOR(i, w, inc) \
+  _Pragma("simd vectorlength(w*8)") \
+  for (unsigned int i = 0; i < w; i += inc)
 #else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
+#define VECTOR_FOR(i, w, inc) \
+  for (unsigned int i = 0; i < w; i += inc)
 #endif
 #else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
+#define VECTOR_FOR(i, w, inc) \
+  for (unsigned int i = 0; i < w; i += inc)
 #endif

-namespace Grid {
-namespace Optimization {
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);

-  // type traits giving the number of elements for each vector type
-  template <typename T> struct W;
-  template <> struct W<double> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
-  };
-  template <> struct W<float> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
-  };
-  template <> struct W<Integer> {
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
-  };
-  template <> struct W<uint16_t> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
-  };
+// type traits giving the number of elements for each vector type
+template <typename T> struct W;
+template <> struct W<double> {
+  constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+  constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+};
+template <> struct W<float> {
+  constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
+  constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+};
+template <> struct W<Integer> {
+  constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+};
+template <> struct W<uint16_t> {
+  constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
+  constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
+};

-  // SIMD vector types
-  template <typename T>
-  struct vec {
-    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
-  };
+// SIMD vector types
+template <typename T>
+struct vec {
+  alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+};

-  typedef vec<float>    vecf;
-  typedef vec<double>   vecd;
-  typedef vec<uint16_t> vech; // half precision comms
-  typedef vec<Integer>  veci;
+typedef vec<float>    vecf;
+typedef vec<double>   vecd;
+typedef vec<uint16_t> vech; // half precision comms
+typedef vec<Integer>  veci;

-}}
+NAMESPACE_END(Optimization);
+NAMESPACE_END(Grid);
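Note: the W<T> traits fix the lane counts so that vec<T> always fills exactly GEN_SIMD_WIDTH bytes (r real lanes, c complex lanes where defined, with r = 2c). A standalone model with compile-time checks, for illustration only (assumes a 16-byte generic width; not part of the patch):

#define GEN_SIMD_WIDTH 16u

// Model of vec<T>: an aligned array sized so the struct exactly fills one
// GEN_SIMD_WIDTH-byte "register", whatever the scalar type.
template <typename T, unsigned int N>
struct vec_model {
  alignas(GEN_SIMD_WIDTH) T v[N];
};

static_assert(sizeof(vec_model<float,  GEN_SIMD_WIDTH/4u>) == GEN_SIMD_WIDTH,
              "4 floats fill a 16-byte vector");
static_assert(sizeof(vec_model<double, GEN_SIMD_WIDTH/8u>) == GEN_SIMD_WIDTH,
              "2 doubles fill a 16-byte vector");

From ec89714ccecd5d11c289951f581576c55517559c Mon Sep 17 00:00:00 2001
From: paboyle 
Date: Fri, 12 Jan 2018 18:24:16 +0000
Subject: [PATCH 007/754] NAMESPACE

---
 lib/simd/Grid_neon.h | 1060 +++++++++++++++++++++---------------------
 1 file changed, 528 insertions(+), 532 deletions(-)

diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h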
index 9f757fbf..f53575cb 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -25,8 +25,8 @@
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 
 /*
 
@@ -45,555 +45,551 @@
 #include "Grid_generic_types.h"
 #include <arm_neon.h>
 
-namespace Grid {
-namespace Optimization {
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);
 
-  template <typename vtype>
-  union uconv {
-    float32x4_t f;
-    vtype v;
-  };
-  union u128f {
-    float32x4_t v;
-    float f[4];
-  };
-  union u128d {
-    float64x2_t v;
-    double f[2];
-  };
-  // half precision
-  union u128h {
-    float16x8_t v;
-    uint16_t f[8];
-  };
+template <typename vtype>
+union uconv {
+  float32x4_t f;
+  vtype v;
+};
+union u128f {
+  float32x4_t v;
+  float f[4];
+};
+union u128d {
+  float64x2_t v;
+  double f[2];
+};
+// half precision
+union u128h {
+  float16x8_t v;
+  uint16_t f[8];
+};
 
-  struct Vsplat{
-    //Complex float
-    inline float32x4_t operator()(float a, float b){
-      float tmp[4]={a,b,a,b};
-      return vld1q_f32(tmp);
-    }
-    // Real float
-    inline float32x4_t operator()(float a){
-      return vdupq_n_f32(a);
-    }
-    //Complex double
-    inline float64x2_t operator()(double a, double b){
-      double tmp[2]={a,b};
-      return vld1q_f64(tmp);
-    }
-    //Real double
-    inline float64x2_t operator()(double a){
-      return vdupq_n_f64(a);
-    }
-    //Integer
-    inline uint32x4_t operator()(Integer a){
-      return vdupq_n_u32(a);
-    }
-  };
-
-  struct Vstore{
-    //Float
-    inline void operator()(float32x4_t a, float* F){
-      vst1q_f32(F, a);
-    }
-    //Double
-    inline void operator()(float64x2_t a, double* D){
-      vst1q_f64(D, a);
-    }
-    //Integer
-    inline void operator()(uint32x4_t a, Integer* I){
-      vst1q_u32(I, a);
-    }
-
-  };
-
-  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
- //Float // N:generic - inline void operator()(float * a, float32x4_t b){ - memcpy(a,&b,4*sizeof(float)); - } - //Double // N:generic - inline void operator()(double * a, float64x2_t b){ - memcpy(a,&b,2*sizeof(double)); - } - - - }; - - // Nils: Vset untested; not used currently in Grid at all; - // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b - struct Vset{ - // Complex float - inline float32x4_t operator()(Grid::ComplexF *a){ - float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; - return vld1q_f32(tmp); - } - // Complex double - inline float64x2_t operator()(Grid::ComplexD *a){ - double tmp[2]={a[0].imag(),a[0].real()}; - return vld1q_f64(tmp); - } - // Real float - inline float32x4_t operator()(float *a){ - float tmp[4]={a[3],a[2],a[1],a[0]}; - return vld1q_f32(tmp); - } - // Real double - inline float64x2_t operator()(double *a){ - double tmp[2]={a[1],a[0]}; - return vld1q_f64(tmp); - } - // Integer - inline uint32x4_t operator()(Integer *a){ - return vld1q_dup_u32(a); - } - }; - - template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - return vaddq_f32(a,b); - } - //Complex/Real double - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - return vaddq_f64(a,b); - } - //Integer - inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - return vaddq_u32(a,b); - } - }; - - struct Sub{ - //Complex/Real float - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - return vsubq_f32(a,b); - } - //Complex/Real double - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - return vsubq_f64(a,b); - } - //Integer - inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - return vsubq_u32(a,b); - } - }; - - struct MultRealPart{ - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t re = vtrn1q_f32(a, a); - return vmulq_f32(re, b); - } - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - float64x2_t re = vzip1q_f64(a, a); - return vmulq_f64(re, b); - } - }; - - struct MaddRealPart{ - inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ - float32x4_t re = vtrn1q_f32(a, a); - return vfmaq_f32(c, re, b); - } - inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ - float64x2_t re = vzip1q_f64(a, a); - return vfmaq_f64(c, re, b); - } - }; - - struct Div{ - // Real float - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - return vdivq_f32(a, b); - } - // Real double - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - return vdivq_f64(a, b); - } - }; - - struct MultComplex{ - // Complex float - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - - float32x4_t r0, r1, r2, r3, r4; - - // a = ar ai Ar Ai - // b = br bi Br Bi - // collect real/imag part, negate bi and Bi - r0 = vtrn1q_f32(b, b); // br br Br Br - r1 = vnegq_f32(b); // -br -bi -Br -Bi - r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi - - // the fun part - r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ... - r4 = vrev64q_f32(r3); // -bi*ai bi*ar ... - - // fma(a,b,c) = a+b*c - return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... 
- - // no fma, use mul and add - // float32x4_t r5; - // r5 = vmulq_f32(r0, a); - // return vaddq_f32(r4, r5); - } - // Complex double - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - - float64x2_t r0, r1, r2, r3, r4; - - // b = br bi - // collect real/imag part, negate bi - r0 = vtrn1q_f64(b, b); // br br - r1 = vnegq_f64(b); // -br -bi - r2 = vtrn2q_f64(b, r1); // bi -bi - - // the fun part - r3 = vmulq_f64(r2, a); // bi*ar -bi*ai - r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar - - // fma(a,b,c) = a+b*c - return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi - - // no fma, use mul and add - // float64x2_t r5; - // r5 = vmulq_f64(r0, a); - // return vaddq_f64(r4, r5); - } - }; - - struct Mult{ - // Real float - inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ - //return vaddq_f32(vmulq_f32(b,c),a); - return vfmaq_f32(a, b, c); - } - inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ - //return vaddq_f64(vmulq_f64(b,c),a); - return vfmaq_f64(a, b, c); - } - inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - return vmulq_f32(a,b); - } - // Real double - inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - return vmulq_f64(a,b); - } - // Integer - inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - return vmulq_u32(a,b); - } - }; - - struct Conj{ - // Complex single - inline float32x4_t operator()(float32x4_t in){ - // ar ai br bi -> ar -ai br -bi - float32x4_t r0, r1; - r0 = vnegq_f32(in); // -ar -ai -br -bi - r1 = vrev64q_f32(r0); // -ai -ar -bi -br - return vtrn1q_f32(in, r1); // ar -ai br -bi - } - // Complex double - inline float64x2_t operator()(float64x2_t in){ - - float64x2_t r0, r1; - r0 = vextq_f64(in, in, 1); // ai ar - r1 = vnegq_f64(r0); // -ai -ar - return vextq_f64(r0, r1, 1); // ar -ai - } - // do not define for integer input - }; - - struct TimesMinusI{ - //Complex single - inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // ar ai br bi -> ai -ar ai -br - float32x4_t r0, r1; - r0 = vnegq_f32(in); // -ar -ai -br -bi - r1 = vrev64q_f32(in); // ai ar bi br - return vtrn1q_f32(r1, r0); // ar -ai br -bi - } - //Complex double - inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ - // a ib -> b -ia - float64x2_t tmp; - tmp = vnegq_f64(in); - return vextq_f64(in, tmp, 1); - } - }; - - struct TimesI{ - //Complex single - inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // ar ai br bi -> -ai ar -bi br - float32x4_t r0, r1; - r0 = vnegq_f32(in); // -ar -ai -br -bi - r1 = vrev64q_f32(r0); // -ai -ar -bi -br - return vtrn1q_f32(r1, in); // -ai ar -bi br - } - //Complex double - inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ - // a ib -> -b ia - float64x2_t tmp; - tmp = vnegq_f64(in); - return vextq_f64(tmp, in, 1); - } - }; - - struct Permute{ - - static inline float32x4_t Permute0(float32x4_t in){ // N:ok - // AB CD -> CD AB - return vextq_f32(in, in, 2); - }; - static inline float32x4_t Permute1(float32x4_t in){ // N:ok - // AB CD -> BA DC - return vrev64q_f32(in); - }; - static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle - return in; - }; - static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle - return in; - }; - - static inline float64x2_t Permute0(float64x2_t in){ // N:ok - // AB -> BA - return vextq_f64(in, in, 1); - }; - static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle - return in; - }; - static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle - 
return in; - }; - static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle - return in; - }; - - }; - - struct Rotate{ - - static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok - switch(n){ - case 0: // AB CD -> AB CD - return tRotate<0>(in); - break; - case 1: // AB CD -> BC DA - return tRotate<1>(in); - break; - case 2: // AB CD -> CD AB - return tRotate<2>(in); - break; - case 3: // AB CD -> DA BC - return tRotate<3>(in); - break; - default: assert(0); - } - } - static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok - switch(n){ - case 0: // AB -> AB - return tRotate<0>(in); - break; - case 1: // AB -> BA - return tRotate<1>(in); - break; - default: assert(0); - } - } - - template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; - template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; - - }; - - struct PrecisionChange { - - static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { - float16x4_t h = vcvt_f16_f32(a); - return vcvt_high_f16_f32(h, b); - } - static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { - sb = vcvt_high_f32_f16(h); - // there is no direct conversion from lower float32x4_t to float64x2_t - // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang - // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang - // workaround for clang - uint32x4_t h1u = reinterpret_cast(h); - float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); - sa = vcvt_high_f32_f16(h1); - } - static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { - float32x2_t s = vcvt_f32_f64(a); - return vcvt_high_f32_f64(s, b); - - } - static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { - b = vcvt_high_f64_f32(s); - // there is no direct conversion from lower float32x4_t to float64x2_t - float32x4_t s1 = vextq_f32(s, s, 2); - a = vcvt_high_f64_f32(s1); - - } - static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { - float32x4_t s1 = DtoS(a, b); - float32x4_t s2 = DtoS(c, d); - return StoH(s1, s2); - } - static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { - float32x4_t s1, s2; - HtoS(h, s1, s2); - StoD(s1, a, b); - StoD(s2, c, d); - } - }; - - ////////////////////////////////////////////// - // Exchange support - - struct Exchange{ - static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ - // in1: ABCD -> out1: ABEF - // in2: EFGH -> out2: CDGH - - // z: CDAB - float32x4_t z = vextq_f32(in1, in1, 2); - // out1: ABEF - out1 = vextq_f32(z, in2, 2); - - // z: GHEF - z = vextq_f32(in2, in2, 2); - // out2: CDGH - out2 = vextq_f32(in1, z, 2); - }; - - static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ - // in1: ABCD -> out1: AECG - // in2: EFGH -> out2: BFDH - out1 = vtrn1q_f32(in1, in2); - out2 = vtrn2q_f32(in1, in2); - }; - static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ - assert(0); - return; - }; - static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ - assert(0); - return; - }; - // double precision - static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ - // in1: AB -> out1: AC - // in2: CD -> out2: BD - out1 = vzip1q_f64(in1, in2); - out2 = vzip2q_f64(in1, in2); - }; - static inline void 
Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ - assert(0); - return; - }; - static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ - assert(0); - return; - }; - static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ - assert(0); - return; - }; - }; - - ////////////////////////////////////////////// - // Some Template specialization - - - //Complex float Reduce - template<> - inline Grid::ComplexF Reduce::operator()(float32x4_t in){ - float32x4_t v1; // two complex - v1 = Optimization::Permute::Permute0(in); - v1 = vaddq_f32(v1,in); - u128f conv; conv.v=v1; - return Grid::ComplexF(conv.f[0],conv.f[1]); +struct Vsplat{ + //Complex float + inline float32x4_t operator()(float a, float b){ + float tmp[4]={a,b,a,b}; + return vld1q_f32(tmp); } - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(float32x4_t in){ - return vaddvq_f32(in); + // Real float + inline float32x4_t operator()(float a){ + return vdupq_n_f32(a); + } + //Complex double + inline float64x2_t operator()(double a, double b){ + double tmp[2]={a,b}; + return vld1q_f64(tmp); + } + //Real double + inline float64x2_t operator()(double a){ + return vdupq_n_f64(a); + } + //Integer + inline uint32x4_t operator()(Integer a){ + return vdupq_n_u32(a); + } +}; + +struct Vstore{ + //Float + inline void operator()(float32x4_t a, float* F){ + vst1q_f32(F, a); + } + //Double + inline void operator()(float64x2_t a, double* D){ + vst1q_f64(D, a); + } + //Integer + inline void operator()(uint32x4_t a, Integer* I){ + vst1q_u32(I, a); } +}; - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(float64x2_t in){ - u128d conv; conv.v = in; - return Grid::ComplexD(conv.f[0],conv.f[1]); +struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? 
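+  // Note: the x86 "stream" stores (_mm_stream_ps/_mm_stream_pd) are
+  // non-temporal writes that bypass the cache. AArch64 NEON exposes no
+  // direct intrinsic equivalent (STNP is the nearest hint), so these
+  // fall back to plain copies: the result is correct, only the
+  // cache-pollution hint is lost.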
+  //Float // N:generic
+  inline void operator()(float * a, float32x4_t b){
+    memcpy(a,&b,4*sizeof(float));
+  }
+  //Double // N:generic
+  inline void operator()(double * a, float64x2_t b){
+    memcpy(a,&b,2*sizeof(double));
+  }
+};
+
+// Nils: Vset untested; not used currently in Grid at all;
+// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
+struct Vset{
+  // Complex float
+  inline float32x4_t operator()(Grid::ComplexF *a){
+    float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
+    return vld1q_f32(tmp);
+  }
+  // Complex double
+  inline float64x2_t operator()(Grid::ComplexD *a){
+    double tmp[2]={a[0].imag(),a[0].real()};
+    return vld1q_f64(tmp);
+  }
+  // Real float
+  inline float32x4_t operator()(float *a){
+    float tmp[4]={a[3],a[2],a[1],a[0]};
+    return vld1q_f32(tmp);
+  }
+  // Real double
+  inline float64x2_t operator()(double *a){
+    double tmp[2]={a[1],a[0]};
+    return vld1q_f64(tmp);
+  }
+  // Integer
+  inline uint32x4_t operator()(Integer *a){
+    return vld1q_dup_u32(a);
+  }
+};
+
+template <typename Out_type, typename In_type>
+struct Reduce{
+  //Need templated class to overload output type
+  //General form must generate error if compiled
+  inline Out_type operator()(In_type in){
+    printf("Error, using wrong Reduce function\n");
+    exit(1);
+    return 0;
+  }
+};
+
+/////////////////////////////////////////////////////
+// Arithmetic operations
+/////////////////////////////////////////////////////
+struct Sum{
+  //Complex/Real float
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    return vaddq_f32(a,b);
+  }
+  //Complex/Real double
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+    return vaddq_f64(a,b);
+  }
+  //Integer
+  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+    return vaddq_u32(a,b);
+  }
+};
+
+struct Sub{
+  //Complex/Real float
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    return vsubq_f32(a,b);
+  }
+  //Complex/Real double
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+    return vsubq_f64(a,b);
+  }
+  //Integer
+  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+    return vsubq_u32(a,b);
+  }
+};
+
+struct MultRealPart{
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    float32x4_t re = vtrn1q_f32(a, a);
+    return vmulq_f32(re, b);
+  }
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+    float64x2_t re = vzip1q_f64(a, a);
+    return vmulq_f64(re, b);
+  }
+};
+
+struct MaddRealPart{
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
+    float32x4_t re = vtrn1q_f32(a, a);
+    return vfmaq_f32(c, re, b);
+  }
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
+    float64x2_t re = vzip1q_f64(a, a);
+    return vfmaq_f64(c, re, b);
+  }
+};
+
+struct Div{
+  // Real float
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    return vdivq_f32(a, b);
+  }
+  // Real double
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+    return vdivq_f64(a, b);
+  }
+};
+
+struct MultComplex{
+  // Complex float
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+
+    float32x4_t r0, r1, r2, r3, r4;
+
+    // a = ar ai Ar Ai
+    // b = br bi Br Bi
+    // collect real/imag part, negate bi and Bi
+    r0 = vtrn1q_f32(b, b); // br br Br Br
+    r1 = vnegq_f32(b); // -br -bi -Br -Bi
+    r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
+
+    // the fun part
+    r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
+    r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
+
+    // fma(a,b,c) = a+b*c
+    return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
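+    // sanity check against the scalar identity (ar + i*ai)*(br + i*bi):
+    //   lane0: r4[0] + r0[0]*a[0] = -bi*ai + br*ar   (real part)
+    //   lane1: r4[1] + r0[1]*a[1] =  bi*ar + br*ai   (imaginary part)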
+
+    // no fma, use mul and add
+    // float32x4_t r5;
+    // r5 = vmulq_f32(r0, a);
+    // return vaddq_f32(r4, r5);
+  }
+  // Complex double
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+
+    float64x2_t r0, r1, r2, r3, r4;
+
+    // b = br bi
+    // collect real/imag part, negate bi
+    r0 = vtrn1q_f64(b, b); // br br
+    r1 = vnegq_f64(b); // -br -bi
+    r2 = vtrn2q_f64(b, r1); // bi -bi
+
+    // the fun part
+    r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
+    r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
+
+    // fma(a,b,c) = a+b*c
+    return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
+
+    // no fma, use mul and add
+    // float64x2_t r5;
+    // r5 = vmulq_f64(r0, a);
+    // return vaddq_f64(r4, r5);
+  }
+};
+
+struct Mult{
+  // Real float
+  inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
+    //return vaddq_f32(vmulq_f32(b,c),a);
+    return vfmaq_f32(a, b, c);
+  }
+  inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
+    //return vaddq_f64(vmulq_f64(b,c),a);
+    return vfmaq_f64(a, b, c);
+  }
+  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    return vmulq_f32(a,b);
+  }
+  // Real double
+  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+    return vmulq_f64(a,b);
+  }
+  // Integer
+  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+    return vmulq_u32(a,b);
+  }
+};
+
+struct Conj{
+  // Complex single
+  inline float32x4_t operator()(float32x4_t in){
+    // ar ai br bi -> ar -ai br -bi
+    float32x4_t r0, r1;
+    r0 = vnegq_f32(in); // -ar -ai -br -bi
+    r1 = vrev64q_f32(r0); // -ai -ar -bi -br
+    return vtrn1q_f32(in, r1); // ar -ai br -bi
+  }
+  // Complex double
+  inline float64x2_t operator()(float64x2_t in){
+
+    float64x2_t r0, r1;
+    r0 = vextq_f64(in, in, 1); // ai ar
+    r1 = vnegq_f64(r0); // -ai -ar
+    return vextq_f64(r0, r1, 1); // ar -ai
+  }
+  // do not define for integer input
+};
+
+struct TimesMinusI{
+  //Complex single
+  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    // ar ai br bi -> ai -ar bi -br
+    float32x4_t r0, r1;
+    r0 = vnegq_f32(in); // -ar -ai -br -bi
+    r1 = vrev64q_f32(in); // ai ar bi br
+    return vtrn1q_f32(r1, r0); // ai -ar bi -br
+  }
+  //Complex double
+  inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+    // a ib -> b -ia
+    float64x2_t tmp;
+    tmp = vnegq_f64(in);
+    return vextq_f64(in, tmp, 1);
+  }
+};
+
+struct TimesI{
+  //Complex single
+  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    // ar ai br bi -> -ai ar -bi br
+    float32x4_t r0, r1;
+    r0 = vnegq_f32(in); // -ar -ai -br -bi
+    r1 = vrev64q_f32(r0); // -ai -ar -bi -br
+    return vtrn1q_f32(r1, in); // -ai ar -bi br
+  }
+  //Complex double
+  inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+    // a ib -> -b ia
+    float64x2_t tmp;
+    tmp = vnegq_f64(in);
+    return vextq_f64(tmp, in, 1);
+  }
+};
+
+struct Permute{
+
+  static inline float32x4_t Permute0(float32x4_t in){ // N:ok
+    // AB CD -> CD AB
+    return vextq_f32(in, in, 2);
+  };
+  static inline float32x4_t Permute1(float32x4_t in){ // N:ok
+    // AB CD -> BA DC
+    return vrev64q_f32(in);
+  };
+  static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
+    return in;
+  };
+  static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
+    return in;
+  };
+
+  static inline float64x2_t Permute0(float64x2_t in){ // N:ok
+    // AB -> BA
+    return vextq_f64(in, in, 1);
+  };
+  static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
+    return in;
+  };
+  static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
+    return in;
+  };
+  static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
+    return in;
+  };
+
+};
+
+struct Rotate{
+
+  static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
+    switch(n){
+    case 0: // AB CD -> AB CD
+      return tRotate<0>(in);
+      break;
+    case 1: // AB CD -> BC DA
+      return tRotate<1>(in);
+      break;
+    case 2: // AB CD -> CD AB
+      return tRotate<2>(in);
+      break;
+    case 3: // AB CD -> DA BC
+      return tRotate<3>(in);
+      break;
+    default: assert(0);
+    }
+  }
+  static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
+    switch(n){
+    case 0: // AB -> AB
+      return tRotate<0>(in);
+      break;
+    case 1: // AB -> BA
+      return tRotate<1>(in);
+      break;
+    default: assert(0);
+    }
  }
 
-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    return vaddvq_f64(in);
-  }
+  template <int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
+  template <int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
 
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    return vaddvq_u32(in);
+};
+
+struct PrecisionChange {
+
+  static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
+    float16x4_t h = vcvt_f16_f32(a);
+    return vcvt_high_f16_f32(h, b);
  }
+  static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
+    sb = vcvt_high_f32_f16(h);
+    // there is no direct conversion from lower float32x4_t to float64x2_t
+    // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
+    // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+    // workaround for clang
+    uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
+    float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
+    sa = vcvt_high_f32_f16(h1);
+  }
+  static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
+    float32x2_t s = vcvt_f32_f64(a);
+    return vcvt_high_f32_f64(s, b);
+
+  }
+  static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
+    b = vcvt_high_f64_f32(s);
+    // there is no direct conversion from lower float32x4_t to float64x2_t
+    float32x4_t s1 = vextq_f32(s, s, 2);
+    a = vcvt_high_f64_f32(s1);
+
+  }
+  static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
+    float32x4_t s1 = DtoS(a, b);
+    float32x4_t s2 = DtoS(c, d);
+    return StoH(s1, s2);
+  }
+  static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
+    float32x4_t s1, s2;
+    HtoS(h, s1, s2);
+    StoD(s1, a, b);
+    StoD(s2, c, d);
+  }
+};
+
+//////////////////////////////////////////////
+// Exchange support
+
+struct Exchange{
+  static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+    // in1: ABCD -> out1: ABEF
+    // in2: EFGH -> out2: CDGH
+
+    // z: CDAB
+    float32x4_t z = vextq_f32(in1, in1, 2);
+    // out1: ABEF
+    out1 = vextq_f32(z, in2, 2);
+
+    // z: GHEF
+    z = vextq_f32(in2, in2, 2);
+    // out2: CDGH
+    out2 = vextq_f32(in1, z, 2);
+  };
+
+  static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+    // in1: ABCD -> out1: AECG
+    // in2: EFGH -> out2: BFDH
+    out1 = vtrn1q_f32(in1, in2);
+    out2 = vtrn2q_f32(in1, in2);
+  };
+  static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+    assert(0);
+    return;
+  };
+  static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+    assert(0);
+    return;
+  };
+  // double precision
+  static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+    // in1: AB -> out1: AC
+    // in2: CD -> out2: BD
+    out1 = vzip1q_f64(in1, in2);
+    out2 = vzip2q_f64(in1, in2);
+  };
+  static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+    assert(0);
+    return;
+  };
+  static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+    assert(0);
+    return;
+  };
+  static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+    assert(0);
+    return;
+  };
+};
+
+//////////////////////////////////////////////
+// Some Template specialization
+
+
+//Complex float Reduce
+template<>
+inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
+  float32x4_t v1; // two complex
+  v1 = Optimization::Permute::Permute0(in);
+  v1 = vaddq_f32(v1,in);
+  u128f conv; conv.v=v1;
+  return Grid::ComplexF(conv.f[0],conv.f[1]);
}
+//Real float Reduce
+template<>
+inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
+  return vaddvq_f32(in);
+}
+
+
+//Complex double Reduce
+template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
+  u128d conv; conv.v = in;
+  return Grid::ComplexD(conv.f[0],conv.f[1]);
+}
+
+//Real double Reduce
+template<>
+inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+  return vaddvq_f64(in);
+}
+
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
+  return vaddvq_u32(in);
+}
+
+NAMESPACE_END(Optimization);
 
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types
 
 // typedef Optimization::vech SIMD_Htype; // Reduced precision type
-  typedef float16x8_t SIMD_Htype; // Half precision type
-  typedef float32x4_t SIMD_Ftype; // Single precision type
-  typedef float64x2_t SIMD_Dtype; // Double precision type
-  typedef uint32x4_t SIMD_Itype; // Integer type
+typedef float16x8_t SIMD_Htype; // Half precision type
+typedef float32x4_t SIMD_Ftype; // Single precision type
+typedef float64x2_t SIMD_Dtype; // Double precision type
+typedef uint32x4_t SIMD_Itype; // Integer type
 
-  inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
-  inline void prefetch_HINT_T0(const char *ptr){};
+inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
+inline void prefetch_HINT_T0(const char *ptr){};
 
-  // Function name aliases
-  typedef Optimization::Vsplat VsplatSIMD;
-  typedef Optimization::Vstore VstoreSIMD;
-  typedef Optimization::Vset VsetSIMD;
-  typedef Optimization::Vstream VstreamSIMD;
-  template <class S, class V> using ReduceSIMD = Optimization::Reduce<S,V>;
+// Function name aliases
+typedef Optimization::Vsplat VsplatSIMD;
+typedef Optimization::Vstore VstoreSIMD;
+typedef Optimization::Vset VsetSIMD;
+typedef Optimization::Vstream VstreamSIMD;
+template <class S, class V> using ReduceSIMD = Optimization::Reduce<S,V>;
 
+// Arithmetic operations
+typedef Optimization::Sum SumSIMD;
+typedef Optimization::Sub SubSIMD;
+typedef Optimization::Div DivSIMD;
+typedef Optimization::Mult MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
+typedef Optimization::MaddRealPart MaddRealPartSIMD;
+typedef Optimization::Conj ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI TimesISIMD;
 
-
-
-  // Arithmetic operations
-  typedef Optimization::Sum SumSIMD;
-  typedef Optimization::Sub SubSIMD;
-  typedef Optimization::Div DivSIMD;
-  typedef Optimization::Mult MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef 
Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; - -} +NAMESPACE_END(Grid); From 00c49d4c17f783c85a566fc5892cee02969b4207 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 18:25:39 +0000 Subject: [PATCH 008/754] Format --- lib/simd/Grid_qpx.h | 903 ++++++++++++++++++++++---------------------- 1 file changed, 452 insertions(+), 451 deletions(-) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 8de7bde8..a4efeb91 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -25,14 +25,14 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - ******************************************************************************/ +******************************************************************************/ #ifndef GEN_SIMD_WIDTH #define GEN_SIMD_WIDTH 32u #endif #include "Grid_generic_types.h" // Definitions for simulated integer SIMD. -namespace Grid { +NAMESPACE_BEGIN(Grid); #ifdef QPX #include @@ -41,139 +41,140 @@ namespace Grid { #include #endif -namespace Optimization { - typedef struct - { - float v0,v1,v2,v3; - } vector4float; +NAMESPACE_BEGIN(Optimization); - inline std::ostream & operator<<(std::ostream& stream, const vector4double a) - { - stream << "{"<::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a; } - return out; - } - }; + return out; + } +}; - struct Vstore{ - //Float - inline void operator()(vector4double a, float *f){ - vec_st(a, 0, f); - } +struct Vstore{ + //Float + inline void operator()(vector4double a, float *f){ + vec_st(a, 0, f); + } - inline void operator()(vector4double a, vector4float &f){ - vec_st(a, 0, (float *)(&f)); - } + inline void operator()(vector4double a, vector4float &f){ + vec_st(a, 0, (float *)(&f)); + } - inline void operator()(vector4float a, float *f){ - f[0] = a.v0; - f[1] = a.v1; - f[2] = a.v2; - f[3] = a.v3; - } + inline void operator()(vector4float a, float *f){ + f[0] = a.v0; + f[1] = a.v1; + f[2] = a.v2; + f[3] = a.v3; + } - //Double - inline void operator()(vector4double a, double *d){ - vec_st(a, 0, d); - } + //Double + inline void operator()(vector4double a, double *d){ + vec_st(a, 0, d); + } - //Integer - inline void operator()(veci a, Integer *i){ - *((veci *)i) = a; - } - }; + //Integer + inline void operator()(veci a, Integer *i){ + *((veci *)i) = a; + } +}; - struct Vstream{ - //Float - inline void operator()(float *f, vector4double a){ - vec_st(a, 0, f); - } +struct Vstream{ + //Float + inline void operator()(float *f, vector4double a){ + vec_st(a, 0, f); + } - inline void operator()(vector4float f, vector4double a){ - vec_st(a, 0, (float *)(&f)); - } + inline void operator()(vector4float f, vector4double a){ + vec_st(a, 0, (float *)(&f)); + } - inline void operator()(float *f, vector4float a){ - f[0] = a.v0; - f[1] = a.v1; - f[2] = a.v2; - f[3] = a.v3; - } - //Double - inline void operator()(double *d, vector4double a){ - vec_st(a, 0, d); - } + inline void operator()(float *f, vector4float a){ + f[0] = a.v0; + f[1] = a.v1; + f[2] = a.v2; + f[3] = a.v3; + } + //Double + inline void operator()(double *d, vector4double a){ + vec_st(a, 0, d); + } - }; +}; - struct Vset{ - // Complex float - inline vector4float operator()(Grid::ComplexF *a){ - return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()}; - } - // Complex double - 
inline vector4double operator()(Grid::ComplexD *a){ - return vec_ld(0, (double *)a); - } +struct Vset{ + // Complex float + inline vector4float operator()(Grid::ComplexF *a){ + return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()}; + } + // Complex double + inline vector4double operator()(Grid::ComplexD *a){ + return vec_ld(0, (double *)a); + } - // Real float - inline vector4float operator()(float *a){ - return (vector4float){a[0], a[1], a[2], a[3]}; - } + // Real float + inline vector4float operator()(float *a){ + return (vector4float){a[0], a[1], a[2], a[3]}; + } - inline vector4double operator()(vector4float a){ - return vec_ld(0, (float *)(&a)); - } + inline vector4double operator()(vector4float a){ + return vec_ld(0, (float *)(&a)); + } - // Real double - inline vector4double operator()(double *a){ - return vec_ld(0, a); - } - // Integer - inline veci operator()(Integer *a){ - veci out; + // Real double + inline vector4double operator()(double *a){ + return vec_ld(0, a); + } + // Integer + inline veci operator()(Integer *a){ + veci out; - out = *((veci *)a); + out = *((veci *)a); - return out; - } - }; + return out; + } +}; - template +template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled @@ -184,406 +185,407 @@ namespace Optimization { } }; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// - #define FLOAT_WRAP_3(fn, pref)\ +#define FLOAT_WRAP_3(fn, pref) \ pref vector4float fn(vector4float a, vector4float b, vector4float c) \ - {\ - vector4double ad, bd, rd, cd; \ - vector4float r;\ - \ - ad = Vset()(a);\ - bd = Vset()(b);\ - cd = Vset()(c);\ - rd = fn(ad, bd, cd); \ - Vstore()(rd, r);\ - \ - return r;\ + { \ + vector4double ad, bd, rd, cd; \ + vector4float r; \ + \ + ad = Vset()(a); \ + bd = Vset()(b); \ + cd = Vset()(c); \ + rd = fn(ad, bd, cd); \ + Vstore()(rd, r); \ + \ + return r; \ } - #define FLOAT_WRAP_2(fn, pref)\ - pref vector4float fn(vector4float a, vector4float b)\ - {\ - vector4double ad, bd, rd;\ - vector4float r;\ - \ - ad = Vset()(a);\ - bd = Vset()(b);\ - rd = fn(ad, bd);\ - Vstore()(rd, r);\ - \ - return r;\ +#define FLOAT_WRAP_2(fn, pref) \ + pref vector4float fn(vector4float a, vector4float b) \ + { \ + vector4double ad, bd, rd; \ + vector4float r; \ + \ + ad = Vset()(a); \ + bd = Vset()(b); \ + rd = fn(ad, bd); \ + Vstore()(rd, r); \ + \ + return r; \ } - #define FLOAT_WRAP_1(fn, pref)\ - pref vector4float fn(vector4float a)\ - {\ - vector4double ad, rd;\ - vector4float r;\ - \ - ad = Vset()(a);\ - rd = fn(ad);\ - Vstore()(rd, r);\ - \ - return r;\ +#define FLOAT_WRAP_1(fn, pref) \ + pref vector4float fn(vector4float a) \ + { \ + vector4double ad, rd; \ + vector4float r; \ + \ + ad = Vset()(a); \ + rd = fn(ad); \ + Vstore()(rd, r); \ + \ + return r; \ } - struct Sum{ - //Complex/Real double - inline vector4double operator()(vector4double a, vector4double b){ - return vec_add(a, b); - } +struct Sum{ + //Complex/Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_add(a, b); + } - //Complex/Real float - FLOAT_WRAP_2(operator(), inline) + //Complex/Real float + FLOAT_WRAP_2(operator(), inline) - //Integer - inline veci operator()(veci a, veci b){ - veci out; + //Integer + inline veci operator()(veci a, veci b){ + veci 
out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] + b.v[i]; } - return out; - } - }; + return out; + } +}; - struct Sub{ - //Complex/Real double - inline vector4double operator()(vector4double a, vector4double b){ - return vec_sub(a, b); - } +struct Sub{ + //Complex/Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_sub(a, b); + } - //Complex/Real float - FLOAT_WRAP_2(operator(), inline) + //Complex/Real float + FLOAT_WRAP_2(operator(), inline) - //Integer - inline veci operator()(veci a, veci b){ - veci out; + //Integer + inline veci operator()(veci a, veci b){ + veci out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] - b.v[i]; } - return out; - } - }; + return out; + } +}; - struct MultRealPart{ - // Complex double - inline vector4double operator()(vector4double a, vector4double b){ - // return vec_xmul(b, a); - return vec_xmul(a, b); - } - FLOAT_WRAP_2(operator(), inline) - }; - struct MaddRealPart{ - // Complex double - inline vector4double operator()(vector4double a, vector4double b,vector4double c){ - return vec_xmadd(a, b, c); - } - FLOAT_WRAP_3(operator(), inline) - }; - struct MultComplex{ - // Complex double - inline vector4double operator()(vector4double a, vector4double b){ - return vec_xxnpmadd(a, b, vec_xmul(b, a)); - } +struct MultRealPart{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ + // return vec_xmul(b, a); + return vec_xmul(a, b); + } + FLOAT_WRAP_2(operator(), inline) +}; +struct MaddRealPart{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b,vector4double c){ + return vec_xmadd(a, b, c); + } + FLOAT_WRAP_3(operator(), inline) +}; +struct MultComplex{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_xxnpmadd(a, b, vec_xmul(b, a)); + } - // Complex float - FLOAT_WRAP_2(operator(), inline) - }; + // Complex float + FLOAT_WRAP_2(operator(), inline) +}; - struct Mult{ - // Real double - inline vector4double operator()(vector4double a, vector4double b){ - return vec_mul(a, b); - } +struct Mult{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_mul(a, b); + } - // Real float - FLOAT_WRAP_2(operator(), inline) + // Real float + FLOAT_WRAP_2(operator(), inline) - // Integer - inline veci operator()(veci a, veci b){ - veci out; + // Integer + inline veci operator()(veci a, veci b){ + veci out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]*b.v[i]; } - return out; - } - }; + return out; + } +}; - struct Div{ - // Real double - inline vector4double operator()(vector4double a, vector4double b){ - return vec_swdiv(a, b); - } +struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdiv(a, b); + } - // Real float - FLOAT_WRAP_2(operator(), inline) + // Real float + FLOAT_WRAP_2(operator(), inline) - // Integer - inline veci operator()(veci a, veci b){ - veci out; + // Integer + inline veci operator()(veci a, veci b){ + veci out; - VECTOR_FOR(i, W::r, 1) + VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]/b.v[i]; } - return out; - } - }; + return out; + } +}; - struct Conj{ - // Complex double - inline vector4double operator()(vector4double v){ - return vec_mul(v, (vector4double){1., -1., 1., -1.}); - } +struct Conj{ + // Complex double + inline vector4double operator()(vector4double v){ + return vec_mul(v, (vector4double){1., -1., 1., -1.}); 
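+    // lane-wise this is (ar, ai, br, bi) * (1, -1, 1, -1) = (ar, -ai, br, -bi):
+    // negating the imaginary lanes is exactly complex conjugation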
+ } - // Complex float - FLOAT_WRAP_1(operator(), inline) - }; + // Complex float + FLOAT_WRAP_1(operator(), inline) +}; - struct TimesMinusI{ - //Complex double - inline vector4double operator()(vector4double v, vector4double ret){ - return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.}, - (vector4double){0., 0., 0., 0.}); - } +struct TimesMinusI{ + //Complex double + inline vector4double operator()(vector4double v, vector4double ret){ + return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.}, + (vector4double){0., 0., 0., 0.}); + } - // Complex float - FLOAT_WRAP_2(operator(), inline) - }; + // Complex float + FLOAT_WRAP_2(operator(), inline) +}; - struct TimesI{ - //Complex double - inline vector4double operator()(vector4double v, vector4double ret){ - return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.}, - (vector4double){0., 0., 0., 0.}); - } +struct TimesI{ + //Complex double + inline vector4double operator()(vector4double v, vector4double ret){ + return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.}, + (vector4double){0., 0., 0., 0.}); + } - // Complex float - FLOAT_WRAP_2(operator(), inline) - }; + // Complex float + FLOAT_WRAP_2(operator(), inline) +}; #define USE_FP16 - struct PrecisionChange { - static inline vech StoH (const vector4float &a, const vector4float &b) { - vech ret; - std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl; - assert(0); - return ret; - } - static inline void HtoS (vech h, vector4float &sa, vector4float &sb) { - std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl; - assert(0); - } - static inline vector4float DtoS (vector4double a, vector4double b) { - vector4float ret; - std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl; - assert(0); - return ret; - } - static inline void StoD (vector4float s, vector4double &a, vector4double &b) { - std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl; - assert(0); - } - static inline vech DtoH (vector4double a, vector4double b, - vector4double c, vector4double d) { - vech ret; - std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl; - assert(0); - return ret; - } - static inline void HtoD (vech h, vector4double &a, vector4double &b, - vector4double &c, vector4double &d) { - std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl; - assert(0); - } - }; +struct PrecisionChange { + static inline vech StoH (const vector4float &a, const vector4float &b) { + vech ret; + std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoS (vech h, vector4float &sa, vector4float &sb) { + std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl; + assert(0); + } + static inline vector4float DtoS (vector4double a, vector4double b) { + vector4float ret; + std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void StoD (vector4float s, vector4double &a, vector4double &b) { + std::cout << GridLogError << "QPX single to double precision conversion not yet supported." 
<< std::endl; + assert(0); + } + static inline vech DtoH (vector4double a, vector4double b, + vector4double c, vector4double d) { + vech ret; + std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoD (vech h, vector4double &a, vector4double &b, + vector4double &c, vector4double &d) { + std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl; + assert(0); + } +}; - ////////////////////////////////////////////// - // Exchange support -#define FLOAT_WRAP_EXCHANGE(fn) \ +////////////////////////////////////////////// +// Exchange support +#define FLOAT_WRAP_EXCHANGE(fn) \ static inline void fn(vector4float &out1, vector4float &out2, \ - vector4float in1, vector4float in2) \ - { \ - vector4double out1d, out2d, in1d, in2d; \ - in1d = Vset()(in1); \ - in2d = Vset()(in2); \ - fn(out1d, out2d, in1d, in2d); \ - Vstore()(out1d, out1); \ - Vstore()(out2d, out2); \ + vector4float in1, vector4float in2) \ + { \ + vector4double out1d, out2d, in1d, in2d; \ + in1d = Vset()(in1); \ + in2d = Vset()(in2); \ + fn(out1d, out2d, in1d, in2d); \ + Vstore()(out1d, out1); \ + Vstore()(out2d, out2); \ } - struct Exchange{ +struct Exchange{ - // double precision - static inline void Exchange0(vector4double &out1, vector4double &out2, - vector4double in1, vector4double in2) { - out1 = vec_perm(in1, in2, vec_gpci(0145)); - out2 = vec_perm(in1, in2, vec_gpci(02367)); - } - static inline void Exchange1(vector4double &out1, vector4double &out2, - vector4double in1, vector4double in2) { - out1 = vec_perm(in1, in2, vec_gpci(0426)); - out2 = vec_perm(in1, in2, vec_gpci(01537)); - } - static inline void Exchange2(vector4double &out1, vector4double &out2, - vector4double in1, vector4double in2) { - assert(0); - } - static inline void Exchange3(vector4double &out1, vector4double &out2, - vector4double in1, vector4double in2) { - assert(0); - } + // double precision + static inline void Exchange0(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0145)); + out2 = vec_perm(in1, in2, vec_gpci(02367)); + } + static inline void Exchange1(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0426)); + out2 = vec_perm(in1, in2, vec_gpci(01537)); + } + static inline void Exchange2(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } + static inline void Exchange3(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } - // single precision - FLOAT_WRAP_EXCHANGE(Exchange0); - FLOAT_WRAP_EXCHANGE(Exchange1); - FLOAT_WRAP_EXCHANGE(Exchange2); - FLOAT_WRAP_EXCHANGE(Exchange3); + // single precision + FLOAT_WRAP_EXCHANGE(Exchange0); + FLOAT_WRAP_EXCHANGE(Exchange1); + FLOAT_WRAP_EXCHANGE(Exchange2); + FLOAT_WRAP_EXCHANGE(Exchange3); +}; + +struct Permute{ + //Complex double + static inline vector4double Permute0(vector4double v){ //0123 -> 2301 + return vec_perm(v, v, vec_gpci(02301)); + }; + static inline vector4double Permute1(vector4double v){ //0123 -> 1032 + return vec_perm(v, v, vec_gpci(01032)); + }; + static inline vector4double Permute2(vector4double v){ + return v; + }; + static inline vector4double Permute3(vector4double v){ + return v; }; - struct Permute{ - //Complex double - static inline vector4double Permute0(vector4double v){ //0123 -> 2301 - return 
vec_perm(v, v, vec_gpci(02301)); - }; - static inline vector4double Permute1(vector4double v){ //0123 -> 1032 - return vec_perm(v, v, vec_gpci(01032)); - }; - static inline vector4double Permute2(vector4double v){ - return v; - }; - static inline vector4double Permute3(vector4double v){ - return v; - }; + // Complex float + FLOAT_WRAP_1(Permute0, static inline) + FLOAT_WRAP_1(Permute1, static inline) + FLOAT_WRAP_1(Permute2, static inline) + FLOAT_WRAP_1(Permute3, static inline) +}; + +struct Rotate{ - // Complex float - FLOAT_WRAP_1(Permute0, static inline) - FLOAT_WRAP_1(Permute1, static inline) - FLOAT_WRAP_1(Permute2, static inline) - FLOAT_WRAP_1(Permute3, static inline) + template static inline vector4double tRotate(vector4double v){ + if ( n==1 ) return vec_perm(v, v, vec_gpci(01230)); + if ( n==2 ) return vec_perm(v, v, vec_gpci(02301)); + if ( n==3 ) return vec_perm(v, v, vec_gpci(03012)); + return v; }; - - struct Rotate{ - - template static inline vector4double tRotate(vector4double v){ - if ( n==1 ) return vec_perm(v, v, vec_gpci(01230)); - if ( n==2 ) return vec_perm(v, v, vec_gpci(02301)); - if ( n==3 ) return vec_perm(v, v, vec_gpci(03012)); - return v; - }; - template static inline vector4float tRotate(vector4float a) - { - vector4double ad, rd; - vector4float r; - ad = Vset()(a); - rd = tRotate(ad); - Vstore()(rd, r); - return r; - }; - - static inline vector4double rotate(vector4double v, int n){ - switch(n){ - case 0: - return v; - break; - case 1: - return tRotate<1>(v); - break; - case 2: - return tRotate<2>(v); - break; - case 3: - return tRotate<3>(v); - break; - default: assert(0); - } - } - - static inline vector4float rotate(vector4float v, int n){ - vector4double vd, rd; - vector4float r; - vd = Vset()(v); - rd = rotate(vd, n); - Vstore()(rd, r); - return r; - } + template static inline vector4float tRotate(vector4float a) + { + vector4double ad, rd; + vector4float r; + ad = Vset()(a); + rd = tRotate(ad); + Vstore()(rd, r); + return r; }; - - //Complex float Reduce - template<> - inline Grid::ComplexF - Reduce::operator()(vector4float v) { //2 complex - vector4float v1,v2; - - v1 = Optimization::Permute::Permute0(v); - v1 = Optimization::Sum()(v1, v); - - return Grid::ComplexF(v1.v0, v1.v1); - } - //Real float Reduce - template<> - inline Grid::RealF - Reduce::operator()(vector4float v){ //4 floats - vector4float v1,v2; - - v1 = Optimization::Permute::Permute0(v); - v1 = Optimization::Sum()(v1, v); - v2 = Optimization::Permute::Permute1(v1); - v1 = Optimization::Sum()(v1, v2); - - return v1.v0; - } - - - //Complex double Reduce - template<> - inline Grid::ComplexD - Reduce::operator()(vector4double v){ //2 complex - vector4double v1; - - v1 = Optimization::Permute::Permute0(v); - v1 = vec_add(v1, v); - - return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1)); - } - - //Real double Reduce - template<> - inline Grid::RealD - Reduce::operator()(vector4double v){ //4 doubles - vector4double v1,v2; - - v1 = Optimization::Permute::Permute0(v); - v1 = vec_add(v1, v); - v2 = Optimization::Permute::Permute1(v1); - v1 = vec_add(v1, v2); - return vec_extract(v1, 0); - } - - //Integer Reduce - template<> - inline Integer Reduce::operator()(veci in){ - Integer a = 0; - for (unsigned int i = 0; i < W::r; ++i) - { - a += in.v[i]; + static inline vector4double rotate(vector4double v, int n){ + switch(n){ + case 0: + return v; + break; + case 1: + return tRotate<1>(v); + break; + case 2: + return tRotate<2>(v); + break; + case 3: + return tRotate<3>(v); + break; + default: 
assert(0); } - return a; } + + static inline vector4float rotate(vector4float v, int n){ + vector4double vd, rd; + vector4float r; + vd = Vset()(v); + rd = rotate(vd, n); + Vstore()(rd, r); + return r; + } +}; + +//Complex float Reduce +template<> +inline Grid::ComplexF +Reduce::operator()(vector4float v) { //2 complex + vector4float v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = Optimization::Sum()(v1, v); + + return Grid::ComplexF(v1.v0, v1.v1); } +//Real float Reduce +template<> +inline Grid::RealF +Reduce::operator()(vector4float v){ //4 floats + vector4float v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = Optimization::Sum()(v1, v); + v2 = Optimization::Permute::Permute1(v1); + v1 = Optimization::Sum()(v1, v2); + + return v1.v0; +} + + +//Complex double Reduce +template<> +inline Grid::ComplexD +Reduce::operator()(vector4double v){ //2 complex + vector4double v1; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + + return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1)); +} + +//Real double Reduce +template<> +inline Grid::RealD +Reduce::operator()(vector4double v){ //4 doubles + vector4double v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + v2 = Optimization::Permute::Permute1(v1); + v1 = vec_add(v1, v2); + + return vec_extract(v1, 0); +} + +//Integer Reduce +template<> +inline Integer Reduce::operator()(veci in){ + Integer a = 0; + for (unsigned int i = 0; i < W::r; ++i) + { + a += in.v[i]; + } + return a; +} + +NAMESPACE_END(Optimization); //////////////////////////////////////////////////////////////////////////////// // Here assign types @@ -596,7 +598,6 @@ typedef Optimization::veci SIMD_Itype; // Integer type inline void v_prefetch0(int size, const char *ptr){}; inline void prefetch_HINT_T0(const char *ptr){}; - // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -615,5 +616,5 @@ typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; - -} + +NAMESPACE_END(Grid) From d8ff895e74f6ffa63f4a9ec5fab14dfb5145ef0e Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 18:27:22 +0000 Subject: [PATCH 009/754] NAMESPACE and format --- lib/simd/Grid_sse4.h | 1074 +++++++++++++++++++++--------------------- 1 file changed, 533 insertions(+), 541 deletions(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 0b1f9ffb..72a2ec9a 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,8 +25,8 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ //---------------------------------------------------------------------- /*! 
@file Grid_sse4.h @brief Optimization libraries for SSE4 instructions set @@ -38,580 +38,572 @@ Author: neo #include -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); - template - union uconv { - __m128 f; - vtype v; - }; +template +union uconv { + __m128 f; + vtype v; +}; - union u128f { - __m128 v; - float f[4]; - }; - union u128d { - __m128d v; - double f[2]; - }; +union u128f { + __m128 v; + float f[4]; +}; +union u128d { + __m128d v; + double f[2]; +}; - struct Vsplat{ - //Complex float - inline __m128 operator()(float a, float b){ - return _mm_set_ps(b,a,b,a); - } - // Real float - inline __m128 operator()(float a){ - return _mm_set_ps(a,a,a,a); - } - //Complex double - inline __m128d operator()(double a, double b){ - return _mm_set_pd(b,a); - } - //Real double - inline __m128d operator()(double a){ - return _mm_set_pd(a,a); - } - //Integer - inline __m128i operator()(Integer a){ - return _mm_set1_epi32(a); - } +struct Vsplat{ + //Complex float + inline __m128 operator()(float a, float b){ + return _mm_set_ps(b,a,b,a); + } + // Real float + inline __m128 operator()(float a){ + return _mm_set_ps(a,a,a,a); + } + //Complex double + inline __m128d operator()(double a, double b){ + return _mm_set_pd(b,a); + } + //Real double + inline __m128d operator()(double a){ + return _mm_set_pd(a,a); + } + //Integer + inline __m128i operator()(Integer a){ + return _mm_set1_epi32(a); + } +}; + +struct Vstore{ + //Float + inline void operator()(__m128 a, float* F){ + _mm_store_ps(F,a); + } + //Double + inline void operator()(__m128d a, double* D){ + _mm_store_pd(D,a); + } + //Integer + inline void operator()(__m128i a, Integer* I){ + _mm_store_si128((__m128i *)I,a); + } + +}; + +struct Vstream{ + //Float + inline void operator()(float * a, __m128 b){ + _mm_stream_ps(a,b); + } + //Double + inline void operator()(double * a, __m128d b){ + _mm_stream_pd(a,b); + } + + +}; + +struct Vset{ + // Complex float + inline __m128 operator()(Grid::ComplexF *a){ + return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m128d operator()(Grid::ComplexD *a){ + return _mm_set_pd(a[0].imag(),a[0].real()); + } + // Real float + inline __m128 operator()(float *a){ + return _mm_set_ps(a[3],a[2],a[1],a[0]); + } + // Real double + inline __m128d operator()(double *a){ + return _mm_set_pd(a[1],a[0]); + } + // Integer + inline __m128i operator()(Integer *a){ + return _mm_set_epi32(a[3],a[2],a[1],a[0]); + } + + +}; + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// +struct Sum{ + //Complex/Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_add_ps(a,b); + } + //Complex/Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_add_pd(a,b); + } + //Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_add_epi32(a,b); + } +}; + +struct Sub{ + //Complex/Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_sub_ps(a,b); + } + //Complex/Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_sub_pd(a,b); + } + //Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_sub_epi32(a,b); + } +}; + +struct 
MultRealPart{ + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0; + ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } +}; +struct MaddRealPart{ + inline __m128 operator()(__m128 a, __m128 b, __m128 c){ + __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_add_ps(_mm_mul_ps( ymm0, b),c); + } + inline __m128d operator()(__m128d a, __m128d b, __m128d c){ + __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); + return _mm_add_pd(_mm_mul_pd( ymm0, b),c); + } +}; + +struct MultComplex{ + // Complex float + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0,ymm1,ymm2; + ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi + ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai + ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm_addsub_ps(ymm0,ymm1); + } + // Complex double + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0,ymm1,ymm2; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, + ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01 + ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11 + ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm_addsub_pd(ymm0,ymm1); + } +}; + +struct Mult{ + + inline void mac(__m128 &a, __m128 b, __m128 c){ + a= _mm_add_ps(_mm_mul_ps(b,c),a); + } + + inline void mac(__m128d &a, __m128d b, __m128d c){ + a= _mm_add_pd(_mm_mul_pd(b,c),a); + } + + // Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_mul_ps(a,b); + } + // Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_mul_pd(a,b); + } + // Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_mullo_epi32(a,b); + } +}; + +struct Div{ + // Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_div_ps(a,b); + } + // Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_div_pd(a,b); + } +}; + + +struct Conj{ + // Complex single + inline __m128 operator()(__m128 in){ + return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f)); + } + // Complex double + inline __m128d operator()(__m128d in){ + return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested + } + // do not define for integer input +}; + +struct TimesMinusI{ + //Complex single + inline __m128 operator()(__m128 in, __m128 ret){ + __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i + return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + } + //Complex double + inline __m128d operator()(__m128d in, __m128d ret){ + __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i + return _mm_shuffle_pd(tmp,tmp,0x1); + } + + +}; + +struct TimesI{ + //Complex single + inline __m128 operator()(__m128 in, __m128 ret){ + __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i + } + //Complex double + inline __m128d operator()(__m128d in, __m128d ret){ + __m128d tmp = _mm_shuffle_pd(in,in,0x1); + return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i + } +}; + +struct 
Permute{ + + static inline __m128 Permute0(__m128 in){ + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB + }; + static inline __m128 Permute1(__m128 in){ + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC + }; + static inline __m128 Permute2(__m128 in){ + return in; + }; + static inline __m128 Permute3(__m128 in){ + return in; }; - struct Vstore{ - //Float - inline void operator()(__m128 a, float* F){ - _mm_store_ps(F,a); - } - //Double - inline void operator()(__m128d a, double* D){ - _mm_store_pd(D,a); - } - //Integer - inline void operator()(__m128i a, Integer* I){ - _mm_store_si128((__m128i *)I,a); - } - + static inline __m128d Permute0(__m128d in){ //AB -> BA + return _mm_shuffle_pd(in,in,0x1); }; - - struct Vstream{ - //Float - inline void operator()(float * a, __m128 b){ - _mm_stream_ps(a,b); - } - //Double - inline void operator()(double * a, __m128d b){ - _mm_stream_pd(a,b); - } - - + static inline __m128d Permute1(__m128d in){ + return in; }; - - struct Vset{ - // Complex float - inline __m128 operator()(Grid::ComplexF *a){ - return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real()); - } - // Complex double - inline __m128d operator()(Grid::ComplexD *a){ - return _mm_set_pd(a[0].imag(),a[0].real()); - } - // Real float - inline __m128 operator()(float *a){ - return _mm_set_ps(a[3],a[2],a[1],a[0]); - } - // Real double - inline __m128d operator()(double *a){ - return _mm_set_pd(a[1],a[0]); - } - // Integer - inline __m128i operator()(Integer *a){ - return _mm_set_epi32(a[3],a[2],a[1],a[0]); - } - - + static inline __m128d Permute2(__m128d in){ + return in; }; - - template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } + static inline __m128d Permute3(__m128d in){ + return in; }; - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline __m128 operator()(__m128 a, __m128 b){ - return _mm_add_ps(a,b); - } - //Complex/Real double - inline __m128d operator()(__m128d a, __m128d b){ - return _mm_add_pd(a,b); - } - //Integer - inline __m128i operator()(__m128i a, __m128i b){ - return _mm_add_epi32(a,b); - } - }; - - struct Sub{ - //Complex/Real float - inline __m128 operator()(__m128 a, __m128 b){ - return _mm_sub_ps(a,b); - } - //Complex/Real double - inline __m128d operator()(__m128d a, __m128d b){ - return _mm_sub_pd(a,b); - } - //Integer - inline __m128i operator()(__m128i a, __m128i b){ - return _mm_sub_epi32(a,b); - } - }; - - struct MultRealPart{ - inline __m128 operator()(__m128 a, __m128 b){ - __m128 ymm0; - ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br - } - inline __m128d operator()(__m128d a, __m128d b){ - __m128d ymm0; - ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 - return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br - } - }; - struct MaddRealPart{ - inline __m128 operator()(__m128 a, __m128 b, __m128 c){ - __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - return _mm_add_ps(_mm_mul_ps( ymm0, b),c); - } - inline __m128d operator()(__m128d a, __m128d b, __m128d c){ - __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); - return _mm_add_pd(_mm_mul_pd( ymm0, b),c); - } - }; - - 
struct MultComplex{ - // Complex float - inline __m128 operator()(__m128 a, __m128 b){ - __m128 ymm0,ymm1,ymm2; - ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br - ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi - ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai - ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi - return _mm_addsub_ps(ymm0,ymm1); - } - // Complex double - inline __m128d operator()(__m128d a, __m128d b){ - __m128d ymm0,ymm1,ymm2; - ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, - ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br - ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01 - ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11 - ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi - return _mm_addsub_pd(ymm0,ymm1); - } - }; - - struct Mult{ - - inline void mac(__m128 &a, __m128 b, __m128 c){ - a= _mm_add_ps(_mm_mul_ps(b,c),a); - } - - inline void mac(__m128d &a, __m128d b, __m128d c){ - a= _mm_add_pd(_mm_mul_pd(b,c),a); - } - - // Real float - inline __m128 operator()(__m128 a, __m128 b){ - return _mm_mul_ps(a,b); - } - // Real double - inline __m128d operator()(__m128d a, __m128d b){ - return _mm_mul_pd(a,b); - } - // Integer - inline __m128i operator()(__m128i a, __m128i b){ - return _mm_mullo_epi32(a,b); - } - }; - - struct Div{ - // Real float - inline __m128 operator()(__m128 a, __m128 b){ - return _mm_div_ps(a,b); - } - // Real double - inline __m128d operator()(__m128d a, __m128d b){ - return _mm_div_pd(a,b); - } - }; - - - struct Conj{ - // Complex single - inline __m128 operator()(__m128 in){ - return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f)); - } - // Complex double - inline __m128d operator()(__m128d in){ - return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested - } - // do not define for integer input - }; - - struct TimesMinusI{ - //Complex single - inline __m128 operator()(__m128 in, __m128 ret){ - __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i - return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - } - //Complex double - inline __m128d operator()(__m128d in, __m128d ret){ - __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i - return _mm_shuffle_pd(tmp,tmp,0x1); - } - - - }; - - struct TimesI{ - //Complex single - inline __m128 operator()(__m128 in, __m128 ret){ - __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); - return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i - } - //Complex double - inline __m128d operator()(__m128d in, __m128d ret){ - __m128d tmp = _mm_shuffle_pd(in,in,0x1); - return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i - } - }; - - struct Permute{ - - static inline __m128 Permute0(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB - }; - static inline __m128 Permute1(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC - }; - static inline __m128 Permute2(__m128 in){ - return in; - }; - static inline __m128 Permute3(__m128 in){ - return in; - }; - - static inline __m128d Permute0(__m128d in){ //AB -> BA - return _mm_shuffle_pd(in,in,0x1); - }; - static inline __m128d Permute1(__m128d in){ - return in; - }; - static inline __m128d Permute2(__m128d in){ - return in; - }; - static inline __m128d Permute3(__m128d in){ - return in; - }; - }; - +}; #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define 
_my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) #ifdef SFW_FP16 - struct Grid_half { - Grid_half(){} - Grid_half(uint16_t raw) : x(raw) {} - uint16_t x; - }; - union FP32 { - unsigned int u; - float f; - }; +struct Grid_half { + Grid_half(){} + Grid_half(uint16_t raw) : x(raw) {} + uint16_t x; +}; +union FP32 { + unsigned int u; + float f; +}; - // PAB - Lifted and adapted from Eigen, which is GPL V2 - inline float sfw_half_to_float(Grid_half h) { - const FP32 magic = { 113 << 23 }; - const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - unsigned int exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust - // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? - o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? - o.u += 1 << 23; // extra exp adjust - o.f -= magic.f; // renormalize - } - o.u |= (h.x & 0x8000) << 16; // sign bit - return o.f; +// PAB - Lifted and adapted from Eigen, which is GPL V2 +inline float sfw_half_to_float(Grid_half h) { + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize } - inline Grid_half sfw_float_to_half(float ff) { - FP32 f; f.f = ff; - const FP32 f32infty = { 255 << 23 }; - const FP32 f16max = { (127 + 16) << 23 }; - const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; - unsigned int sign_mask = 0x80000000u; - Grid_half o; + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +} +inline Grid_half sfw_float_to_half(float ff) { + FP32 f; f.f = ff; + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + Grid_half o; - o.x = static_cast(0x0u); - unsigned int sign = f.u & sign_mask; - f.u ^= sign; - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. Important if you want fast straight SSE2 code - // (since there's no unsigned PCMPGTD). - if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) - o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf - } else { // (De)normalized number or zero - if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero - // use a magic value to align our 10 mantissa bits at the bottom of - // the float. as long as FP addition is round-to-nearest-even this - // just works. - f.f += denorm_magic.f; - // and one integer subtract of the bias later, we have our final float! - o.x = static_cast(f.u - denorm_magic.u); - } else { - unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + o.x = static_cast(0x0u); + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). 
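+  // (Editor's annotation -- these comment lines are not part of the original
+  //  patch.) The branches below split three cases of the conversion:
+  //    f.u >= f16max.u    -> overflow: saturate to Inf (0x7c00) or qNaN (0x7e00)
+  //    f.u < (113 << 23)  -> FP16 subnormal/zero: the denorm_magic add performs
+  //                          the shift and the rounding in one float operation
+  //    otherwise          -> normal number: the 0xfff + mant_odd bias gives
+  //                          round-to-nearest-even on the truncated mantissa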
+ if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - // update exponent, rounding bias part 1 - f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; - // rounding bias part 2 - f.u += mant_odd; - // take the bits! - o.x = static_cast(f.u >> 13); - } - } - o.x |= static_cast(sign >> 16); - return o; - } - static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) { - __m128i ret=(__m128i)_mm_setzero_ps(); - float *fp = (float *)&f; - Grid_half *hp = (Grid_half *)&ret; - hp[0] = sfw_float_to_half(fp[0]); - hp[1] = sfw_float_to_half(fp[1]); - hp[2] = sfw_float_to_half(fp[2]); - hp[3] = sfw_float_to_half(fp[3]); - return ret; - } - static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { - __m128 ret=_mm_setzero_ps(); - float *fp = (float *)&ret; - Grid_half *hp = (Grid_half *)&h; - fp[0] = sfw_half_to_float(hp[0]); - fp[1] = sfw_half_to_float(hp[1]); - fp[2] = sfw_half_to_float(hp[2]); - fp[3] = sfw_half_to_float(hp[3]); - return ret; - } + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + o.x |= static_cast(sign >> 16); + return o; +} +static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) { + __m128i ret=(__m128i)_mm_setzero_ps(); + float *fp = (float *)&f; + Grid_half *hp = (Grid_half *)&ret; + hp[0] = sfw_float_to_half(fp[0]); + hp[1] = sfw_float_to_half(fp[1]); + hp[2] = sfw_float_to_half(fp[2]); + hp[3] = sfw_float_to_half(fp[3]); + return ret; +} +static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { + __m128 ret=_mm_setzero_ps(); + float *fp = (float *)&ret; + Grid_half *hp = (Grid_half *)&h; + fp[0] = sfw_half_to_float(hp[0]); + fp[1] = sfw_half_to_float(hp[1]); + fp[2] = sfw_half_to_float(hp[2]); + fp[3] = sfw_half_to_float(hp[3]); + return ret; +} #else #define Grid_mm_cvtps_ph _mm_cvtps_ph #define Grid_mm_cvtph_ps _mm_cvtph_ps #endif - struct PrecisionChange { - static inline __m128i StoH (__m128 a,__m128 b) { - __m128i ha = Grid_mm_cvtps_ph(a,0); - __m128i hb = Grid_mm_cvtps_ph(b,0); - __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - return h; - } - static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { - sa = Grid_mm_cvtph_ps(h,0); - h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); - sb = Grid_mm_cvtph_ps(h,0); - } - static inline __m128 DtoS (__m128d a,__m128d b) { - __m128 sa = _mm_cvtpd_ps(a); - __m128 sb = _mm_cvtpd_ps(b); - __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - return s; - } - static inline void StoD (__m128 s,__m128d &a,__m128d &b) { - a = _mm_cvtps_pd(s); - s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); - b = _mm_cvtps_pd(s); - } - static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { - __m128 sa,sb; - sa = DtoS(a,b); - sb = DtoS(c,d); - return StoH(sa,sb); - } - static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d 
&c,__m128d &d) { - __m128 sa,sb; - HtoS(h,sa,sb); - StoD(sa,a,b); - StoD(sb,c,d); - } +struct PrecisionChange { + static inline __m128i StoH (__m128 a,__m128 b) { + __m128i ha = Grid_mm_cvtps_ph(a,0); + __m128i hb = Grid_mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return h; + } + static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { + sa = Grid_mm_cvtph_ps(h,0); + h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = Grid_mm_cvtph_ps(h,0); + } + static inline __m128 DtoS (__m128d a,__m128d b) { + __m128 sa = _mm_cvtpd_ps(a); + __m128 sb = _mm_cvtpd_ps(b); + __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return s; + } + static inline void StoD (__m128 s,__m128d &a,__m128d &b) { + a = _mm_cvtps_pd(s); + s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); + b = _mm_cvtps_pd(s); + } + static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { + __m128 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { + __m128 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } +}; + +struct Exchange{ + // 3210 ordering + static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + }; + static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + assert(0); + return; + }; + static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + assert(0); + return; }; - struct Exchange{ - // 3210 ordering - static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); - }; - static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ - out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ - }; - static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - assert(0); - return; - }; - static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - assert(0); - return; - }; - - static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ - out1= _mm_shuffle_pd(in1,in2,0x0); - out2= _mm_shuffle_pd(in1,in2,0x3); - }; - static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ - assert(0); - return; - }; - static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ - assert(0); - return; - }; - static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ - assert(0); - return; - }; + static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + out1= _mm_shuffle_pd(in1,in2,0x0); + out2= 
_mm_shuffle_pd(in1,in2,0x3); }; - - struct Rotate{ - - static inline __m128 rotate(__m128 in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - default: assert(0); - } - } - static inline __m128d rotate(__m128d in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - default: assert(0); - } - } - - template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; - template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; - + static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; }; - ////////////////////////////////////////////// - // Some Template specialization + static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; + }; + static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; + }; +}; +struct Rotate{ - //Complex float Reduce - template<> - inline Grid::ComplexF Reduce::operator()(__m128 in){ - __m128 v1; // two complex - v1= Optimization::Permute::Permute0(in); - v1= _mm_add_ps(v1,in); - u128f conv; conv.v=v1; - return Grid::ComplexF(conv.f[0],conv.f[1]); + static inline __m128 rotate(__m128 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } } - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(__m128 in){ - __m128 v1,v2; // quad single - v1= Optimization::Permute::Permute0(in); - v1= _mm_add_ps(v1,in); - v2= Optimization::Permute::Permute1(v1); - v1 = _mm_add_ps(v1,v2); - u128f conv; conv.v=v1; - return conv.f[0]; - } - - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(__m128d in){ - u128d conv; conv.v = in; - return Grid::ComplexD(conv.f[0],conv.f[1]); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(__m128d in){ - __m128d v1; - v1 = Optimization::Permute::Permute0(in); - v1 = _mm_add_pd(v1,in); - u128d conv; conv.v = v1; - return conv.f[0]; + static inline __m128d rotate(__m128d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + default: assert(0); + } } - //Integer Reduce - template<> - inline Integer Reduce::operator()(__m128i in){ - __m128i v1 = _mm_hadd_epi32(in, in); - __m128i v2 = _mm_hadd_epi32(v1, v1); - return _mm_cvtsi128_si32(v2); - } + template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; + +}; +////////////////////////////////////////////// +// Some Template specialization + + +//Complex float Reduce +template<> +inline Grid::ComplexF Reduce::operator()(__m128 in){ + __m128 v1; // two complex + v1= Optimization::Permute::Permute0(in); + v1= _mm_add_ps(v1,in); + u128f conv; conv.v=v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); +} +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(__m128 in){ + __m128 v1,v2; // quad single + v1= Optimization::Permute::Permute0(in); + v1= _mm_add_ps(v1,in); + v2= Optimization::Permute::Permute1(v1); + v1 = _mm_add_ps(v1,v2); 
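+  // (Editor's annotation, not in the original patch:) two permute/add rounds
+  // fold the four lanes: {a,b,c,d}+{c,d,a,b} gives the pairwise sums, and the
+  // swap-within-pairs add leaves a+b+c+d in every lane, so lane 0 of the
+  // union read-out below is the full horizontal sum.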
+ u128f conv; conv.v=v1; + return conv.f[0]; +} + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(__m128d in){ + u128d conv; conv.v = in; + return Grid::ComplexD(conv.f[0],conv.f[1]); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(__m128d in){ + __m128d v1; + v1 = Optimization::Permute::Permute0(in); + v1 = _mm_add_pd(v1,in); + u128d conv; conv.v = v1; + return conv.f[0]; } - +//Integer Reduce +template<> +inline Integer Reduce::operator()(__m128i in){ + __m128i v1 = _mm_hadd_epi32(in, in); + __m128i v2 = _mm_hadd_epi32(v1, v1); + return _mm_cvtsi128_si32(v2); +} +NAMESPACE_END(Optimization); ////////////////////////////////////////////////////////////////////////////////////// // Here assign types +typedef __m128i SIMD_Htype; // Single precision type +typedef __m128 SIMD_Ftype; // Single precision type +typedef __m128d SIMD_Dtype; // Double precision type +typedef __m128i SIMD_Itype; // Integer type - typedef __m128i SIMD_Htype; // Single precision type - typedef __m128 SIMD_Ftype; // Single precision type - typedef __m128d SIMD_Dtype; // Double precision type - typedef __m128i SIMD_Itype; // Integer type - - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){ - _mm_prefetch(ptr,_MM_HINT_T0); - } - - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; - - - - - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; - +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){ + _mm_prefetch(ptr,_MM_HINT_T0); } + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid); From 6a2eca2ec241d67079acee637ace06fbdcb4a8b9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:00:03 +0000 Subject: [PATCH 010/754] NAMESAPCE --- lib/simd/Grid_vector_types.h | 14 ++++++-------- lib/simd/Grid_vector_unops.h | 14 +++++++------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index c67e74cb..4768c734 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -31,7 +31,7 @@ directory /* END LEGAL */ 
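// (Editor's aside, hedged: the NAMESPACE_BEGIN/NAMESPACE_END definitions are
//  not shown anywhere in this patch series. Judging from their use, they are
//  presumably thin wrappers of roughly this shape:
//
//    #define NAMESPACE_BEGIN(A) namespace A {
//    #define NAMESPACE_END(A)   }
//
//  On that assumption every hunk in these commits is behaviour-preserving:
//  the namespace braces become macros and the enclosed code drops one
//  indentation level, which the subsequent "Format" commits then tidy up.)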
//--------------------------------------------------------------------------- /*! @file Grid_vector_types.h - @brief Defines templated class Grid_simd to deal with inner vector types + @brief Defines templated class Grid_simd to deal with inner vector types */ // Time-stamp: <2015-07-10 17:45:33 neo> //--------------------------------------------------------------------------- @@ -62,7 +62,7 @@ directory #include "l1p.h" -namespace Grid { +NAMESPACE_BEGIN(Grid); ////////////////////////////////////// // To take the floating point type of real/complex type @@ -125,10 +125,10 @@ Out unary(Input src, Operation op) { /* @brief Grid_simd class for the SIMD vector type operations - */ +*/ template class Grid_simd { - public: +public: typedef typename RealPart::type Real; typedef Vector_type vector_type; typedef Scalar_type scalar_type; @@ -427,9 +427,6 @@ class Grid_simd { inline void putlane(const Scalar_type &S, int lane){ ((Scalar_type*)&v)[lane] = S; } - - - }; // end of Grid_simd class definition inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } @@ -863,6 +860,7 @@ template <> struct is_simd : public std::true_type {}; template using IfSimd = Invoke::value, int> >; template using IfNotSimd = Invoke::value, unsigned> >; -} + +NAMESPACE_END(Grid); #endif diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h index 2244566f..bb8ab5a5 100644 --- a/lib/simd/Grid_vector_unops.h +++ b/lib/simd/Grid_vector_unops.h @@ -28,13 +28,13 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef GRID_VECTOR_UNOPS #define GRID_VECTOR_UNOPS #include -namespace Grid { +NAMESPACE_BEGIN(Grid); template struct SqrtRealFunctor { @@ -201,23 +201,23 @@ struct OrOrFunctor { //////////////////////////////// template inline Grid_simd operator&(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(AndFunctor(), x, y); } template inline Grid_simd operator&&(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(AndAndFunctor(), x, y); } template inline Grid_simd operator|(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(OrFunctor(), x, y); } template inline Grid_simd operator||(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(OrOrFunctor(), x, y); } -} +NAMESPACE_END(Grid); #endif From 3281559ec32be1026f5c118084ba7a713042f203 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:09:01 +0000 Subject: [PATCH 011/754] Format --- lib/simd/Grid_avx.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 1994e224..ad9800fb 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -134,15 +134,15 @@ struct Vset{ }; template - struct Reduce{ - // Need templated class to overload output type - // General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; +struct Reduce{ + // Need templated class to overload output type + // General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; 
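// (Editor's illustration, not part of the patch: the primary template above
//  is a deliberate run-time trap, and each backend specialises it per
//  scalar/vector pair, as the SSE4 file does earlier in this series. For
//  AVX single precision the specialisation would take roughly this shape,
//  assuming the __m256 vector type used throughout this header:
//
//    template <>
//    inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in) {
//      // fold the eight lanes pairwise with permute+add, then read lane 0
//    }
//  )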
///////////////////////////////////////////////////// // Arithmetic operations From c64deedf74164e8c112df25066138d847534a262 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:09:35 +0000 Subject: [PATCH 012/754] Format --- lib/simd/Grid_avx512.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index ff572464..d22a3b81 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -126,15 +126,15 @@ struct Vset{ }; template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; ///////////////////////////////////////////////////// From 63865e42320456ce0a272c36b348b5b84b32cc1b Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:10:48 +0000 Subject: [PATCH 013/754] format --- lib/simd/Grid_generic.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index d555a672..6fef3e2b 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -342,7 +342,7 @@ struct PrecisionChange { struct Exchange{ template - static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ const int w = W::r; unsigned int mask = w >> (n + 1); // std::cout << " Exchange "< - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; //Complex float Reduce template <> From 0a6168eef069df76090bae774764160559ff757e Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:11:22 +0000 Subject: [PATCH 014/754] Format emacs style --- lib/simd/BGQQPX.h | 956 +++++++++++++++++++++++----------------------- 1 file changed, 478 insertions(+), 478 deletions(-) diff --git a/lib/simd/BGQQPX.h b/lib/simd/BGQQPX.h index 34888ab7..3f428ad9 100644 --- a/lib/simd/BGQQPX.h +++ b/lib/simd/BGQQPX.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_BGQ_QPX_H #define GRID_ASM_BGQ_QPX_H @@ -127,20 +127,20 @@ Author: paboyle /********************************************************* * Macro sequences encoding QCD *********************************************************/ -#define LOCK_GAUGE(dir) \ - { \ - uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ - for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ - CACHE_LOCK(&byte_addr[i]); \ - } \ +#define LOCK_GAUGE(dir) \ + { \ + uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ + for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ + CACHE_LOCK(&byte_addr[i]); \ + } \ } -#define UNLOCK_GAUGE(dir) \ - { \ - uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ - for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ - CACHE_UNLOCK(&byte_addr[i]); \ - } \ +#define UNLOCK_GAUGE(dir) \ + { \ + uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ + for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ + CACHE_UNLOCK(&byte_addr[i]); \ + } \ } #define MAYBEPERM(A,B) @@ -165,369 +165,369 @@ Author: paboyle VLOAD(%0,%3,U0) \ VLOAD(%1,%3,U1) \ VLOAD(%2,%3,U2) \ - VMUL_RR_RI(U0,Chi_00,UChi_00) \ - VMUL_RR_RI(U1,Chi_00,UChi_01) \ - VMUL_RR_RI(U2,Chi_00,UChi_02) \ - VMUL_RR_RI(U0,Chi_10,UChi_10) \ - VMUL_RR_RI(U1,Chi_10,UChi_11) \ - VMUL_RR_RI(U2,Chi_10,UChi_12) \ - VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \ - : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \ - asm ( \ - VLOAD(%0,%3,U0) \ - VLOAD(%1,%3,U1) \ - VLOAD(%2,%3,U2) \ - VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \ - VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \ - VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \ - VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \ - VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \ - VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \ - VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \ - : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \ - asm ( \ - VLOAD(%0,%3,U0) \ - VLOAD(%1,%3,U1) \ - VLOAD(%2,%3,U2) \ - VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \ - VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \ - VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \ - VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \ - VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \ - VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \ - VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \ - : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \ + VMUL_RR_RI(U0,Chi_00,UChi_00) \ + VMUL_RR_RI(U1,Chi_00,UChi_01) \ + VMUL_RR_RI(U2,Chi_00,UChi_02) \ + VMUL_RR_RI(U0,Chi_10,UChi_10) \ + VMUL_RR_RI(U1,Chi_10,UChi_11) \ + VMUL_RR_RI(U2,Chi_10,UChi_12) \ + VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \ + VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \ + VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \ + 
VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
+ VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
+ VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
+ : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \
+ asm ( \
+ VLOAD(%0,%3,U0) \
+ VLOAD(%1,%3,U1) \
+ VLOAD(%2,%3,U2) \
+ VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
+ VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
+ VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
+ VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
+ VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
+ VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
+ VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
+ VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
+ VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
+ VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
+ VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
+ VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
+ : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \
+ asm ( \
+ VLOAD(%0,%3,U0) \
+ VLOAD(%1,%3,U1) \
+ VLOAD(%2,%3,U2) \
+ VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
+ VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
+ VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
+ VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
+ VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
+ VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
+ VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
+ VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
+ VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
+ VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
+ VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
+ VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
+ : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \
}
-#define SAVE_RESULT(base,basep) {\
- uint64_t ub = ((uint64_t)base) - 32; \
- asm("mr %0,"REP";\n\t" \
- "li " IMM ",32;\n\t" \
- VSTORE(IMM,REP,psi_00) \
- VSTORE(IMM,REP,psi_01) \
- VSTORE(IMM,REP,psi_02) \
- VSTORE(IMM,REP,psi_10) \
- VSTORE(IMM,REP,psi_11) \
- VSTORE(IMM,REP,psi_12) \
- VSTORE(IMM,REP,psi_20) \
- VSTORE(IMM,REP,psi_21) \
- VSTORE(IMM,REP,psi_22) \
- VSTORE(IMM,REP,psi_30) \
- VSTORE(IMM,REP,psi_31) \
- VSTORE(IMM,REP,psi_32) \
- ); \
-}
+#define SAVE_RESULT(base,basep) { \
+ uint64_t ub = ((uint64_t)base) - 32; \
+ asm("mr %0,"REP";\n\t" \
+ "li " IMM ",32;\n\t" \
+ VSTORE(IMM,REP,psi_00) \
+ VSTORE(IMM,REP,psi_01) \
+ VSTORE(IMM,REP,psi_02) \
+ VSTORE(IMM,REP,psi_10) \
+ VSTORE(IMM,REP,psi_11) \
+ VSTORE(IMM,REP,psi_12) \
+ VSTORE(IMM,REP,psi_20) \
+ VSTORE(IMM,REP,psi_21) \
+ VSTORE(IMM,REP,psi_22) \
+ VSTORE(IMM,REP,psi_30) \
+ VSTORE(IMM,REP,psi_31) \
+ VSTORE(IMM,REP,psi_32) \
+ ); \
+ }
/*
*Annoying BG/Q loads with no immediate indexing and a big performance hit
*when a second miss to an L1 line occurs
*/
-#define LOAD_CHI(base) { \
- uint64_t ub = ((uint64_t)base) - 64; \
- asm("mr %0,"REP";\n\t" \
- "li " IMM ",64;\n\t" \
- VLOAD(IMM,REP,Chi_00) \
- VLOAD(IMM,REP,Chi_02) \
- VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
- ub = ((uint64_t)base) - 32; \
- asm("mr %0,"REP";\n\t" \
- "li IMM,64;\n\t" \
- VLOAD(IMM,REP,Chimu_01) \
- VLOAD(IMM,REP,Chimu_10) \
- VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
-}
+#define LOAD_CHI(base) { \
+ uint64_t ub = ((uint64_t)base) - 64; \
+ asm("mr %0,"REP";\n\t" \
+ "li " IMM ",64;\n\t" \
+ VLOAD(IMM,REP,Chi_00) \
+ VLOAD(IMM,REP,Chi_02) \
+ VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
+ ub = ((uint64_t)base) - 32; \
+ asm("mr %0,"REP";\n\t" \
+ "li IMM,64;\n\t" \
+ VLOAD(IMM,REP,Chimu_01) \
+ VLOAD(IMM,REP,Chimu_10) \
+ VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
+}
-#define LOAD_CHIMU(base) { \
- uint64_t ub = ((uint64_t)base) - 64; \
- asm("mr %0,"REP";\n\t" \
- "li IMM,64;\n\t" \
- VLOAD(IMM,REP,Chimu_00) \
- VLOAD(IMM,REP,Chimu_02) \
- VLOAD(IMM,REP,Chimu_11) \
- VLOAD(IMM,REP,Chimu_20) \
- 
VLOAD(IMM,REP,Chimu_22) \ - VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \ - ub = ((uint64_t)base) - 32; \ - asm("mr %0,"REP";\n\t" \ - "li IMM,64;\n\t" \ - VLOAD(IMM,REP,Chimu_01) \ - VLOAD(IMM,REP,Chimu_10) \ - VLOAD(IMM,REP,Chimu_12) \ - VLOAD(IMM,REP,Chimu_21) \ - VLOAD(IMM,REP,Chimu_30) \ - VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \ +#define LOAD_CHIMU(base) { \ + uint64_t ub = ((uint64_t)base) - 64; \ + asm("mr %0,"REP";\n\t" \ + "li IMM,64;\n\t" \ + VLOAD(IMM,REP,Chimu_00) \ + VLOAD(IMM,REP,Chimu_02) \ + VLOAD(IMM,REP,Chimu_11) \ + VLOAD(IMM,REP,Chimu_20) \ + VLOAD(IMM,REP,Chimu_22) \ + VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \ + ub = ((uint64_t)base) - 32; \ + asm("mr %0,"REP";\n\t" \ + "li IMM,64;\n\t" \ + VLOAD(IMM,REP,Chimu_01) \ + VLOAD(IMM,REP,Chimu_10) \ + VLOAD(IMM,REP,Chimu_12) \ + VLOAD(IMM,REP,Chimu_21) \ + VLOAD(IMM,REP,Chimu_30) \ + VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \ } // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(base) { \ +#define XP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \ + VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \ + VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \ + VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \ + VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \ + VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \ + ); \ + } + +#define XM_PROJMEM(base) { \ LOAD_CHIMU(base); \ asm ( \ VONE(one) \ - VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \ - VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \ - VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \ - VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \ - VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \ - VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \ - ); \ - } - -#define XM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \ - VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \ - VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \ - VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \ - VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \ - VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \ + VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \ + VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \ + VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \ + VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \ + VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \ + VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \ ); \ } // hspin(0)=fspin(0)-fspin(3); // hspin(1)=fspin(1)+fspin(2); -#define YP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chimu_00,Chimu_00,Chi_30) \ - VSUB(Chimu_01,Chimu_01,Chi_31) \ - VSUB(Chimu_02,Chimu_02,Chi_32) \ - VADD(Chimu_10,Chimu_10,Chi_20) \ - VADD(Chimu_11,Chimu_11,Chi_21) \ - VADD(Chimu_12,Chimu_12,Chi_22) \ - ); \ +#define YP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chimu_00,Chimu_00,Chi_30) \ + VSUB(Chimu_01,Chimu_01,Chi_31) \ + VSUB(Chimu_02,Chimu_02,Chi_32) \ + VADD(Chimu_10,Chimu_10,Chi_20) \ + VADD(Chimu_11,Chimu_11,Chi_21) \ + VADD(Chimu_12,Chimu_12,Chi_22) \ + ); \ } -#define YM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VADD(Chimu_00,Chimu_00,Chi_30) \ - VADD(Chimu_01,Chimu_01,Chi_31) \ - VADD(Chimu_02,Chimu_02,Chi_32) \ - VSUB(Chimu_10,Chimu_10,Chi_20) \ - VSUB(Chimu_11,Chimu_11,Chi_21) \ - VSUB(Chimu_12,Chimu_12,Chi_22) \ - ); \ +#define YM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VADD(Chimu_00,Chimu_00,Chi_30) \ + VADD(Chimu_01,Chimu_01,Chi_31) \ + VADD(Chimu_02,Chimu_02,Chi_32) \ + VSUB(Chimu_10,Chimu_10,Chi_20) \ + VSUB(Chimu_11,Chimu_11,Chi_21) \ + 
VSUB(Chimu_12,Chimu_12,Chi_22) \ + ); \ } - /*Gz - * 0 0 i 0 [0]+-i[2] - * 0 0 0 -i [1]-+i[3] - * -i 0 0 0 - * 0 i 0 0 - */ -#define ZP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \ - VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \ - VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \ - VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \ - VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \ - VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \ - ); \ +/*Gz + * 0 0 i 0 [0]+-i[2] + * 0 0 0 -i [1]-+i[3] + * -i 0 0 0 + * 0 i 0 0 + */ +#define ZP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \ + VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \ + VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \ + VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \ + VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \ + VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \ + ); \ } -#define ZM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \ - VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \ - VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \ - VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \ - VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \ - VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \ - ); \ +#define ZM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \ + VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \ + VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \ + VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \ + VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \ + VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \ + ); \ } - /*Gt - * 0 0 1 0 [0]+-[2] - * 0 0 0 1 [1]+-[3] - * 1 0 0 0 - * 0 1 0 0 - */ -#define TP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VADD(Chimu_00,Chimu_00,Chi_20) \ - VADD(Chimu_01,Chimu_01,Chi_21) \ - VADD(Chimu_02,Chimu_02,Chi_22) \ - VADD(Chimu_10,Chimu_10,Chi_30) \ - VADD(Chimu_11,Chimu_11,Chi_31) \ - VADD(Chimu_12,Chimu_12,Chi_32) \ - ); \ +/*Gt + * 0 0 1 0 [0]+-[2] + * 0 0 0 1 [1]+-[3] + * 1 0 0 0 + * 0 1 0 0 + */ +#define TP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VADD(Chimu_00,Chimu_00,Chi_20) \ + VADD(Chimu_01,Chimu_01,Chi_21) \ + VADD(Chimu_02,Chimu_02,Chi_22) \ + VADD(Chimu_10,Chimu_10,Chi_30) \ + VADD(Chimu_11,Chimu_11,Chi_31) \ + VADD(Chimu_12,Chimu_12,Chi_32) \ + ); \ } -#define TM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chimu_00,Chimu_00,Chi_20) \ - VSUB(Chimu_01,Chimu_01,Chi_21) \ - VSUB(Chimu_02,Chimu_02,Chi_22) \ - VSUB(Chimu_10,Chimu_10,Chi_30) \ - VSUB(Chimu_11,Chimu_11,Chi_31) \ - VSUB(Chimu_12,Chimu_12,Chi_32) \ - ); \ +#define TM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chimu_00,Chimu_00,Chi_20) \ + VSUB(Chimu_01,Chimu_01,Chi_21) \ + VSUB(Chimu_02,Chimu_02,Chi_22) \ + VSUB(Chimu_10,Chimu_10,Chi_30) \ + VSUB(Chimu_11,Chimu_11,Chi_31) \ + VSUB(Chimu_12,Chimu_12,Chi_32) \ + ); \ } /* - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesMinusI(hspin(1)); - fspin(3)=timesMinusI(hspin(0)); + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesMinusI(hspin(1)); + fspin(3)=timesMinusI(hspin(0)); - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(1)); - fspin(3)-=timesI(hspin(0)); - */ -#define XP_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - 
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ - ); \ + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=timesI(hspin(1)); + fspin(3)-=timesI(hspin(0)); +*/ +#define XP_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XM_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ - ); \ +#define XM_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ + VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XP_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ - ); \ + VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XM_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ - ); \ + VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ + 
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ + ); \ } // fspin(2)+=hspin(1); // fspin(3)-=hspin(0); -#define YP_RECON_ACCUM {\ - asm(\ +#define YP_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \ VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \ - );\ - } -#define YM_RECON_ACCUM {\ - asm(\ + ); \ + } +#define YM_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \ VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \ - );\ - } + ); \ + } // fspin(2)-=timesI(hspin(0)); // fspin(3)+=timesI(hspin(1)); -#define ZP_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \ - );\ - } + VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \ + VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \ + ); \ + } -#define ZM_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \ - );\ - } + VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \ + ); \ + } // fspin(2)+=hspin(0); // fspin(3)+=hspin(1); -#define TP_RECON_ACCUM {\ - asm(\ +#define TP_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \ VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \ - );\ - } + ); \ + } -#define TM_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define TM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) 
\ VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \ - );\ - } + ); \ + } uint64_t GetPFInfo(int nent,int plocal); uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal); @@ -551,244 +551,244 @@ void testme(int osites,int ssU) int nmax=osites; for(int site=0;site=nmax) ssn=0; - int sUn=ssn; - for(int s=0;s=nmax) ssn=0; + int sUn=ssn; + for(int s=0;s shuffle and xor the real part sign bit + //////////////////////////////// + // Yp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YP_PROJMEM(base); + YP_PROJMEM(base); #else - YM_PROJMEM(base); + YM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYP(Yp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR2,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFYP(Yp,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YP_RECON_ACCUM; + YP_RECON_ACCUM; #else - YM_RECON_ACCUM; + YM_RECON_ACCUM; #endif - //////////////////////////////// - // Zp - //////////////////////////////// - basep = GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Zp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZP_PROJMEM(base); + ZP_PROJMEM(base); #else - ZM_PROJMEM(base); + ZM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZP(Zp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR1,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFZP(Zp,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZP_RECON_ACCUM; + ZP_RECON_ACCUM; #else - ZM_RECON_ACCUM; + ZM_RECON_ACCUM; #endif - //////////////////////////////// - // Tp - //////////////////////////////// - basep = GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Tp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TP_PROJMEM(base); + TP_PROJMEM(base); #else - TM_PROJMEM(base); + TM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFTP(Tp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR0,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFTP(Tp,basep); + } + 
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TP_RECON_ACCUM; + TP_RECON_ACCUM; #else - TM_RECON_ACCUM; + TM_RECON_ACCUM; #endif - //////////////////////////////// - // Xm - //////////////////////////////// + //////////////////////////////// + // Xm + //////////////////////////////// #ifndef STREAM_STORE - basep= (uint64_t) &out._odata[ss]; + basep= (uint64_t) &out._odata[ss]; #endif - // basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + // basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - XM_PROJMEM(base); + XM_PROJMEM(base); #else - XP_PROJMEM(base); + XP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR3,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFXM(Xm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR3,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFXM(Xm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - XM_RECON_ACCUM; + XM_RECON_ACCUM; #else - XP_RECON_ACCUM; + XP_RECON_ACCUM; #endif - //////////////////////////////// - // Ym - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Ym + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YM_PROJMEM(base); + YM_PROJMEM(base); #else - YP_PROJMEM(base); + YP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYM(Ym,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR2,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFYM(Ym,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YM_RECON_ACCUM; + YM_RECON_ACCUM; #else - YP_RECON_ACCUM; + YP_RECON_ACCUM; #endif - //////////////////////////////// - // Zm - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Zm + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZM_PROJMEM(base); + ZM_PROJMEM(base); #else - ZP_PROJMEM(base); + ZP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZM(Zm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR1,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; + 
PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFZM(Zm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZM_RECON_ACCUM; + ZM_RECON_ACCUM; #else - ZP_RECON_ACCUM; + ZP_RECON_ACCUM; #endif - //////////////////////////////// - // Tm - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Tm + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TM_PROJMEM(base); + TM_PROJMEM(base); #else - TP_PROJMEM(base); + TP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base= (uint64_t) &out._odata[ss]; + MAYBEPERM(PERMUTE_DIR0,perm); + } else { + LOAD_CHI(base); + } + base= (uint64_t) &out._odata[ss]; #ifndef STREAM_STORE - PREFETCH_CHIMU(base); + PREFETCH_CHIMU(base); #endif - { - MULT_2SPIN_DIR_PFTM(Tm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + { + MULT_2SPIN_DIR_PFTM(Tm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TM_RECON_ACCUM; + TM_RECON_ACCUM; #else - TP_RECON_ACCUM; + TP_RECON_ACCUM; #endif - basep= GetPFInfo(nent,plocal); nent++; - SAVE_RESULT(base,basep); + basep= GetPFInfo(nent,plocal); nent++; + SAVE_RESULT(base,basep); - } - ssU++; + } + ssU++; } } From 62fcee72c56ca18439672e78f35ce4ebaee05738 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:16:37 +0000 Subject: [PATCH 015/754] Format, NAMESPACE --- lib/simd/Grid_imci.h | 741 +++++++++++++++++++++---------------------- 1 file changed, 362 insertions(+), 379 deletions(-) diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h index a1dae565..8906525a 100644 --- a/lib/simd/Grid_imci.h +++ b/lib/simd/Grid_imci.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,425 +24,408 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include -namespace Grid{ -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); - struct Vsplat{ - //Complex float - inline __m512 operator()(float a, float b){ - return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); - } - // Real float - inline __m512 operator()(float a){ - return _mm512_set1_ps(a); - } - //Complex double - inline __m512d operator()(double a, double b){ - return _mm512_set_pd(b,a,b,a,b,a,b,a); - } - //Real double - inline __m512d operator()(double a){ - return _mm512_set1_pd(a); - } - //Integer - inline __m512i operator()(Integer a){ - return _mm512_set1_epi32(a); - } - }; +struct Vsplat{ + //Complex float + inline __m512 operator()(float a, float b){ + return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); + } + // Real float + inline __m512 operator()(float a){ + return _mm512_set1_ps(a); + } + //Complex double + inline __m512d operator()(double a, double b){ + return _mm512_set_pd(b,a,b,a,b,a,b,a); + } + //Real double + inline __m512d operator()(double a){ + return _mm512_set1_pd(a); + } + //Integer + inline __m512i operator()(Integer a){ + return _mm512_set1_epi32(a); + } +}; - struct Vstore{ - //Float - inline void operator()(__m512 a, float* F){ - _mm512_store_ps(F,a); - } - //Double - inline void operator()(__m512d a, double* D){ - _mm512_store_pd(D,a); - } - //Integer - inline void operator()(__m512i a, Integer* I){ - _mm512_store_si512((__m512i *)I,a); - } +struct Vstore{ + //Float + inline void operator()(__m512 a, float* F){ + _mm512_store_ps(F,a); + } + //Double + inline void operator()(__m512d a, double* D){ + _mm512_store_pd(D,a); + } + //Integer + inline void operator()(__m512i a, Integer* I){ + _mm512_store_si512((__m512i *)I,a); + } - }; +}; + +struct Vstream{ + //Float + inline void operator()(float * a, __m512 b){ + _mm512_storenrngo_ps(a,b); + } + //Double + inline void operator()(double * a, __m512d b){ + _mm512_storenrngo_pd(a,b); + } - struct Vstream{ - //Float - inline void operator()(float * a, __m512 b){ - _mm512_storenrngo_ps(a,b); - } - //Double - inline void operator()(double * a, __m512d b){ - _mm512_storenrngo_pd(a,b); - } +}; + +struct Vset{ + // Complex float + inline __m512 operator()(Grid::ComplexF *a){ + return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), + a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), + a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m512d operator()(Grid::ComplexD *a){ + return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m512 operator()(float *a){ + return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m512d operator()(double *a){ + return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Integer + inline __m512i operator()(Integer *a){ + return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } - }; +}; +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if 
compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// +struct Sum{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_add_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_add_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_add_epi32(a,b); + } +}; - struct Vset{ - // Complex float - inline __m512 operator()(Grid::ComplexF *a){ - return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), - a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), - a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), - a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Complex double - inline __m512d operator()(Grid::ComplexD *a){ - return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), - a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - } - // Real float - inline __m512 operator()(float *a){ - return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - // Real double - inline __m512d operator()(double *a){ - return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } - // Integer - inline __m512i operator()(Integer *a){ - return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - } +struct Sub{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_sub_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_sub_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_sub_epi32(a,b); + } +}; - - }; - - template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; - - - - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_add_ps(a,b); - } - //Complex/Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_add_pd(a,b); - } - //Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_add_epi32(a,b); - } - }; - - struct Sub{ - //Complex/Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_sub_ps(a,b); - } - //Complex/Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_sub_pd(a,b); - } - //Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_sub_epi32(a,b); - } - }; - - - struct MultComplex{ - // Complex float - inline __m512 operator()(__m512 a, __m512 b){ - __m512 vzero,ymm0,ymm1,real, imag; - vzero = _mm512_setzero_ps(); - ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // - real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); - imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); - ymm1 = _mm512_mul_ps(real, b); - ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK - return _mm512_fmadd_ps(ymm0,imag,ymm1); - } - // Complex double - inline __m512d operator()(__m512d a, __m512d b){ - /* This is from - 
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets - * @inproceedings{McFarlin:2011:ASV:1995896.1995938, - * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, - * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, - * booktitle = {Proceedings of the International Conference on Supercomputing}, - * series = {ICS '11}, - * year = {2011}, - * isbn = {978-1-4503-0102-2}, - * location = {Tucson, Arizona, USA}, - * pages = {265--274}, - * numpages = {10}, - * url = {http://doi.acm.org/10.1145/1995896.1995938}, - * doi = {10.1145/1995896.1995938}, - * acmid = {1995938}, - * publisher = {ACM}, - * address = {New York, NY, USA}, - * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, - * } - */ - __m512d vzero,ymm0,ymm1,real,imag; - vzero =_mm512_setzero_pd(); - ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // - real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0); - imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0); - ymm1 = _mm512_mul_pd(real, b); - ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK - return _mm512_fmadd_pd(ymm0,imag,ymm1); - } - }; +struct MultComplex{ + // Complex float + inline __m512 operator()(__m512 a, __m512 b){ + __m512 vzero,ymm0,ymm1,real, imag; + vzero = _mm512_setzero_ps(); + ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // + real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); + imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); + ymm1 = _mm512_mul_ps(real, b); + ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_ps(ymm0,imag,ymm1); + } + // Complex double + inline __m512d operator()(__m512d a, __m512d b){ + /* This is from + * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets + * @inproceedings{McFarlin:2011:ASV:1995896.1995938, + * author = {McFarlin, Daniel S. 
and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, + * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, + * booktitle = {Proceedings of the International Conference on Supercomputing}, + * series = {ICS '11}, + * year = {2011}, + * isbn = {978-1-4503-0102-2}, + * location = {Tucson, Arizona, USA}, + * pages = {265--274}, + * numpages = {10}, + * url = {http://doi.acm.org/10.1145/1995896.1995938}, + * doi = {10.1145/1995896.1995938}, + * acmid = {1995938}, + * publisher = {ACM}, + * address = {New York, NY, USA}, + * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, + * } + */ + __m512d vzero,ymm0,ymm1,real,imag; + vzero =_mm512_setzero_pd(); + ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // + real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0); + imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0); + ymm1 = _mm512_mul_pd(real, b); + ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_pd(ymm0,imag,ymm1); + } +}; - struct Mult{ +struct Mult{ - inline void mac(__m512 &a, __m512 b, __m512 c){ - a= _mm512_fmadd_ps( b, c, a); - } + inline void mac(__m512 &a, __m512 b, __m512 c){ + a= _mm512_fmadd_ps( b, c, a); + } - inline void mac(__m512d &a, __m512d b, __m512d c){ - a= _mm512_fmadd_pd( b, c, a); - } + inline void mac(__m512d &a, __m512d b, __m512d c){ + a= _mm512_fmadd_pd( b, c, a); + } - // Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_mul_ps(a,b); - } - // Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_mul_pd(a,b); - } - // Integer - inline __m512i operator()(__m512i a, __m512i b){ - return _mm512_mullo_epi32(a,b); - } - }; + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_mul_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_mul_pd(a,b); + } + // Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_mullo_epi32(a,b); + } +}; - struct Div{ - // Real float - inline __m512 operator()(__m512 a, __m512 b){ - return _mm512_div_ps(a,b); - } - // Real double - inline __m512d operator()(__m512d a, __m512d b){ - return _mm512_div_pd(a,b); - } - }; +struct Div{ + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_div_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_div_pd(a,b); + } +}; + +struct Conj{ + // Complex single + inline __m512 operator()(__m512 in){ + return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag + } + // Complex double + inline __m512d operator()(__m512d in){ + return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); + } + // do not define for integer input +}; + +struct TimesMinusI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag + return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag + return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK + } +}; + +struct TimesI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag + } + 
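  // TimesI and TimesMinusI are built from the same two steps on the
  // interleaved (re,im) lanes, applied in opposite order: a masked
  // subtract-from-zero that negates every other lane (mask 0xaaaa selects
  // the odd, imaginary lanes), and the CDAB swizzle that swaps the two
  // halves of each pair.  For a single scalar pair:
  //   negate then swap : (re, im) -> (re, -im) -> (-im,  re)
  //   swap then negate : (re, im) -> (im,  re) -> ( im, -re)
  // so the ordering alone fixes the sign of i.  The second (ret) argument
  // is unused; it appears to exist only to keep a uniform functor signature.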
//Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag + } - struct Conj{ - // Complex single - inline __m512 operator()(__m512 in){ - return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag - } - // Complex double - inline __m512d operator()(__m512d in){ - return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); - } - // do not define for integer input - }; +}; - struct TimesMinusI{ - //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag - return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK - } - //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag - return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK - } - - - }; - - struct TimesI{ - //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK - return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag - } - //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK - return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag - } - - - }; - - - struct Permute{ +struct Permute{ - static inline __m512 Permute0(__m512 in){ - return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2)); - }; - static inline __m512 Permute1(__m512 in){ - return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1)); - }; - static inline __m512 Permute2(__m512 in){ - return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC); - }; - static inline __m512 Permute3(__m512 in){ - return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB); - }; - - static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d - return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2)); - }; - static inline __m512d Permute1(__m512d in){ - return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC); - }; - static inline __m512d Permute2(__m512d in){ - return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB); - }; - static inline __m512d Permute3(__m512d in){ - return in; - }; - + static inline __m512 Permute0(__m512 in){ + return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2)); }; + static inline __m512 Permute1(__m512 in){ + return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1)); + }; + static inline __m512 Permute2(__m512 in){ + return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC); + }; + static inline __m512 Permute3(__m512 in){ + return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB); + }; + + static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d + return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2)); + }; + static inline __m512d Permute1(__m512d in){ + return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC); + }; + static inline __m512d Permute2(__m512d in){ + return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB); + }; + static inline __m512d Permute3(__m512d in){ + return in; + }; + +}; - struct Rotate{ +struct Rotate{ - static inline __m512 rotate(__m512 in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: 
return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - case 4: return tRotate<4>(in);break; - case 5: return tRotate<5>(in);break; - case 6: return tRotate<6>(in);break; - case 7: return tRotate<7>(in);break; + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; - case 8 : return tRotate<8>(in);break; - case 9 : return tRotate<9>(in);break; - case 10: return tRotate<10>(in);break; - case 11: return tRotate<11>(in);break; - case 12: return tRotate<12>(in);break; - case 13: return tRotate<13>(in);break; - case 14: return tRotate<14>(in);break; - case 15: return tRotate<15>(in);break; - default: assert(0); - } + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); } - static inline __m512d rotate(__m512d in,int n){ - switch(n){ - case 0: return tRotate<0>(in);break; - case 1: return tRotate<1>(in);break; - case 2: return tRotate<2>(in);break; - case 3: return tRotate<3>(in);break; - case 4: return tRotate<4>(in);break; - case 5: return tRotate<5>(in);break; - case 6: return tRotate<6>(in);break; - case 7: return tRotate<7>(in);break; - default: assert(0); - } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); } + } - template static inline __m512 tRotate(__m512 in){ - return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); - }; - - template static inline __m512d tRotate(__m512d in){ - return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n); - }; - + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); }; + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n); + }; +}; - ////////////////////////////////////////////// - // Some Template specialization - - //Complex float Reduce - template<> - inline Grid::ComplexF Reduce::operator()(__m512 in){ - return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in)); - } - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(__m512 in){ - return _mm512_reduce_add_ps(in); - } - - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(__m512d in){ - return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in)); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(__m512d in){ - return _mm512_reduce_add_pd(in); - } - - //Integer Reduce - template<> - inline Integer Reduce::operator()(__m512i in){ - return _mm512_reduce_add_epi32(in); - } - +////////////////////////////////////////////// +// Some Template 
specialization +//Complex float Reduce +template<> +inline Grid::ComplexF Reduce::operator()(__m512 in){ + return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in)); } +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(__m512 in){ + return _mm512_reduce_add_ps(in); +} + + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(__m512d in){ + return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in)); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(__m512d in){ + return _mm512_reduce_add_pd(in); +} + +//Integer Reduce +template<> +inline Integer Reduce::operator()(__m512i in){ + return _mm512_reduce_add_epi32(in); +} + +NAMESPACE_END(Grid); ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m512 SIMD_Ftype; // Single precision type - typedef __m512d SIMD_Dtype; // Double precision type - typedef __m512i SIMD_Itype; // Integer type +typedef __m512 SIMD_Ftype; // Single precision type +typedef __m512d SIMD_Dtype; // Double precision type +typedef __m512i SIMD_Itype; // Integer type - // prefecth - inline void v_prefetch0(int size, const char *ptr){ - for(int i=0;i using ReduceSIMD = Optimization::Reduce; - - - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; - } +inline void prefetch_HINT_T0(const char *ptr){ + _mm_prefetch(ptr,_MM_HINT_T0); +} + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid); From 5e2cd0d07cd1557e65740fe6b6d79510aadeeab6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:18:22 +0000 Subject: [PATCH 016/754] Format --- lib/simd/Grid_qpx.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index a4efeb91..d93dbc5b 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -175,15 +175,15 @@ struct Vset{ }; template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; ///////////////////////////////////////////////////// // Arithmetic operations From 347d5404ddd8022bcaee53f7bef36548cc0f8eab Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:21:25 +0000 Subject: [PATCH 
017/754] format

---
 lib/simd/Grid_vector_types.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 4768c734..f0a301a4 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -1,37 +1,37 @@
 /*************************************************************************************
 
-Grid physics library, www.github.com/paboyle/Grid
+ Grid physics library, www.github.com/paboyle/Grid
 
-Source file: ./lib/simd/Grid_vector_type.h
+ Source file: ./lib/simd/Grid_vector_types.h
 
-Copyright (C) 2015
+ Copyright (C) 2015
 
 Author: Azusa Yamaguchi
 Author: Guido Cossu
 Author: Peter Boyle
 Author: neo
+Author: paboyle
 
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-See the full license in the file "LICENSE" in the top level distribution
-directory
+ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
 //---------------------------------------------------------------------------
 /*! @file Grid_vector_types.h
- @brief Defines templated class Grid_simd to deal with inner vector types
+ @brief Defines templated class Grid_simd to deal with inner vector types
 */
 // Time-stamp: <2015-07-10 17:45:33 neo>
 //---------------------------------------------------------------------------

From 3c7bf211a99869d933a70507553073a4e5086e3f Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:22:18 +0000
Subject: [PATCH 018/754] Reformat

---
 lib/simd/IBM_qpx.h | 438 ++++++++++++++++++++++-----------------------
 1 file changed, 219 insertions(+), 219 deletions(-)

diff --git a/lib/simd/IBM_qpx.h b/lib/simd/IBM_qpx.h
index df91d8e6..ca01907b 100644
--- a/lib/simd/IBM_qpx.h
+++ b/lib/simd/IBM_qpx.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid
 
@@ -23,8 +23,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_BGQ_QPX_H #define GRID_ASM_BGQ_QPX_H @@ -153,32 +153,32 @@ Author: paboyle /********************************************************* * Macro sequences encoding QCD *********************************************************/ -#define LOCK_GAUGE(dir) \ - { \ - uint64_t byte_addr = (uint64_t)&U._odata[sU]; \ - int count = (sizeof(U._odata[0])+63)/64; \ - asm (" mtctr %0 \n" \ - " mr " HASH(REP) ", %1\n" \ - " li " HASH(IMM) ", 64\n" \ +#define LOCK_GAUGE(dir) \ + { \ + uint64_t byte_addr = (uint64_t)&U._odata[sU]; \ + int count = (sizeof(U._odata[0])+63)/64; \ + asm (" mtctr %0 \n" \ + " mr " HASH(REP) ", %1\n" \ + " li " HASH(IMM) ", 64\n" \ "0:\n" \ - LOCK_SET \ - " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \ + LOCK_SET \ + " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \ " bdnz 0b\n" \ - : : "b" (count), "b" (byte_addr) ); \ + : : "b" (count), "b" (byte_addr) ); \ } -#define UNLOCK_GAUGE(dir) \ - { \ - uint64_t byte_addr = (uint64_t)&U._odata[sU]; \ - int count = (sizeof(U._odata[0])+63)/64; \ - asm (" mtctr %0 \n" \ - " mr " HASH(REP) ", %1\n" \ - " li " HASH(IMM) ", 64\n" \ - "0:\n" \ - LOCK_CLEAR \ - " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \ +#define UNLOCK_GAUGE(dir) \ + { \ + uint64_t byte_addr = (uint64_t)&U._odata[sU]; \ + int count = (sizeof(U._odata[0])+63)/64; \ + asm (" mtctr %0 \n" \ + " mr " HASH(REP) ", %1\n" \ + " li " HASH(IMM) ", 64\n" \ + "0:\n" \ + LOCK_CLEAR \ + " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \ " bdnz 0b\n" \ - : : "b" (count), "b" (byte_addr) ); \ + : : "b" (count), "b" (byte_addr) ); \ } #define ZERO_PSI \ @@ -201,11 +201,11 @@ Author: paboyle #define MULT_2SPIN_QPXf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16) #define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) { \ - uint64_t ub = ((uint64_t)ptr); \ - asm ( \ - ULOAD(%0,%3,U0) \ - ULOAD(%1,%3,U1) \ - ULOAD(%2,%3,U2) \ + uint64_t ub = ((uint64_t)ptr); \ + asm ( \ + ULOAD(%0,%3,U0) \ + ULOAD(%1,%3,U1) \ + ULOAD(%2,%3,U2) \ VMUL_RR_RI(UChi_00,U0,Chi_00) \ VMUL_RR_RI(UChi_01,U1,Chi_00) \ VMUL_RR_RI(UChi_02,U2,Chi_00) \ @@ -235,7 +235,7 @@ Author: paboyle VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10) \ VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11) \ VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12) \ - : : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \ + : : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \ asm ( \ ULOAD(%0,%3,U0) \ ULOAD(%1,%3,U1) \ @@ -252,30 +252,30 @@ Author: paboyle VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10) \ VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11) \ VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12) \ - : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \ + : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \ } #define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p) #define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf) -#define SAVE_RESULT(base,basep) {\ - uint64_t ub = ((uint64_t)base) - (VSIZE); \ - asm("mr " HASH(REP) ", %0;\n" \ - "li " HASH(IMM) "," HASH(VSIZE)" ;\n" \ - VSTOREu(IMM,REP,psi_00) \ - VSTOREu(IMM,REP,psi_01) \ - VSTOREu(IMM,REP,psi_02) \ - VSTOREu(IMM,REP,psi_10) \ - VSTOREu(IMM,REP,psi_11) \ - VSTOREu(IMM,REP,psi_12) \ - VSTOREu(IMM,REP,psi_20) \ - VSTOREu(IMM,REP,psi_21) \ - VSTOREu(IMM,REP,psi_22) \ - 
VSTOREu(IMM,REP,psi_30) \ - VSTOREu(IMM,REP,psi_31) \ - VSTOREu(IMM,REP,psi_32) \ - : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ +#define SAVE_RESULT(base,basep) { \ + uint64_t ub = ((uint64_t)base) - (VSIZE); \ + asm("mr " HASH(REP) ", %0;\n" \ + "li " HASH(IMM) "," HASH(VSIZE)" ;\n" \ + VSTOREu(IMM,REP,psi_00) \ + VSTOREu(IMM,REP,psi_01) \ + VSTOREu(IMM,REP,psi_02) \ + VSTOREu(IMM,REP,psi_10) \ + VSTOREu(IMM,REP,psi_11) \ + VSTOREu(IMM,REP,psi_12) \ + VSTOREu(IMM,REP,psi_20) \ + VSTOREu(IMM,REP,psi_21) \ + VSTOREu(IMM,REP,psi_22) \ + VSTOREu(IMM,REP,psi_30) \ + VSTOREu(IMM,REP,psi_31) \ + VSTOREu(IMM,REP,psi_32) \ + : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ } @@ -295,7 +295,7 @@ Author: paboyle "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \ VLOADu(IMM,REP,Chi_01) \ VLOADu(IMM,REP,Chi_10) \ - VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ + VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ } #define LOAD_CHIMU(base) { \ @@ -316,27 +316,27 @@ Author: paboyle VLOADu(IMM,REP,Chi_12) \ VLOADu(IMM,REP,Chi_21) \ VLOADu(IMM,REP,Chi_30) \ - VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ + VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \ } // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \ - VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \ - VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \ - VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \ - VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \ - VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \ - ); \ +#define XP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \ + VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \ + VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \ + VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \ + VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \ + VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \ + ); \ } -#define XM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ +#define XM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ VONE(one) \ VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \ VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \ @@ -349,19 +349,19 @@ Author: paboyle // hspin(0)=fspin(0)-fspin(3); // hspin(1)=fspin(1)+fspin(2); -#define YP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chi_00,Chi_00,Chi_30) \ - VSUB(Chi_01,Chi_01,Chi_31) \ - VSUB(Chi_02,Chi_02,Chi_32) \ - VADD(Chi_10,Chi_10,Chi_20) \ - VADD(Chi_11,Chi_11,Chi_21) \ - VADD(Chi_12,Chi_12,Chi_22) \ - ); \ +#define YP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chi_00,Chi_00,Chi_30) \ + VSUB(Chi_01,Chi_01,Chi_31) \ + VSUB(Chi_02,Chi_02,Chi_32) \ + VADD(Chi_10,Chi_10,Chi_20) \ + VADD(Chi_11,Chi_11,Chi_21) \ + VADD(Chi_12,Chi_12,Chi_22) \ + ); \ } -#define YM_PROJMEM(base) { \ +#define YM_PROJMEM(base) { \ LOAD_CHIMU(base); \ asm ( \ VADD(Chi_00,Chi_00,Chi_30) \ @@ -372,162 +372,162 @@ Author: paboyle VSUB(Chi_12,Chi_12,Chi_22) ); \ } - /*Gz - * 0 0 i 0 [0]+-i[2] - * 0 0 0 -i [1]-+i[3] - * -i 0 0 0 - * 0 i 0 0 - */ -#define ZP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \ - VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \ - VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \ - VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \ - VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \ - VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \ - ); \ +/*Gz + * 0 0 i 0 [0]+-i[2] + * 0 0 0 -i [1]-+i[3] + * -i 0 0 0 + * 0 i 0 0 + */ +#define ZP_PROJMEM(base) { \ + LOAD_CHIMU(base); 
\ + asm ( \ + VONE(one) \ + VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \ + VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \ + VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \ + VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \ + VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \ + VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \ + ); \ } -#define ZM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \ - VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \ - VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \ - VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \ - VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \ - VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \ - ); \ +#define ZM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \ + VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \ + VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \ + VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \ + VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \ + VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \ + ); \ } - /*Gt - * 0 0 1 0 [0]+-[2] - * 0 0 0 1 [1]+-[3] - * 1 0 0 0 - * 0 1 0 0 - */ -#define TP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VADD(Chi_00,Chi_00,Chi_20) \ - VADD(Chi_01,Chi_01,Chi_21) \ - VADD(Chi_02,Chi_02,Chi_22) \ - VADD(Chi_10,Chi_10,Chi_30) \ - VADD(Chi_11,Chi_11,Chi_31) \ - VADD(Chi_12,Chi_12,Chi_32) \ - ); \ +/*Gt + * 0 0 1 0 [0]+-[2] + * 0 0 0 1 [1]+-[3] + * 1 0 0 0 + * 0 1 0 0 + */ +#define TP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VADD(Chi_00,Chi_00,Chi_20) \ + VADD(Chi_01,Chi_01,Chi_21) \ + VADD(Chi_02,Chi_02,Chi_22) \ + VADD(Chi_10,Chi_10,Chi_30) \ + VADD(Chi_11,Chi_11,Chi_31) \ + VADD(Chi_12,Chi_12,Chi_32) \ + ); \ } -#define TM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chi_00,Chi_00,Chi_20) \ - VSUB(Chi_01,Chi_01,Chi_21) \ - VSUB(Chi_02,Chi_02,Chi_22) \ - VSUB(Chi_10,Chi_10,Chi_30) \ - VSUB(Chi_11,Chi_11,Chi_31) \ - VSUB(Chi_12,Chi_12,Chi_32) \ - ); \ +#define TM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chi_00,Chi_00,Chi_20) \ + VSUB(Chi_01,Chi_01,Chi_21) \ + VSUB(Chi_02,Chi_02,Chi_22) \ + VSUB(Chi_10,Chi_10,Chi_30) \ + VSUB(Chi_11,Chi_11,Chi_31) \ + VSUB(Chi_12,Chi_12,Chi_32) \ + ); \ } /* - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesMinusI(hspin(1)); - fspin(3)=timesMinusI(hspin(0)); + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesMinusI(hspin(1)); + fspin(3)=timesMinusI(hspin(0)); - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(1)); - fspin(3)-=timesI(hspin(0)); - */ -#define XP_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \ - VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \ - VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \ - VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \ - VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \ - VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \ - ); \ + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=timesI(hspin(1)); + fspin(3)-=timesI(hspin(0)); +*/ +#define XP_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \ + VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \ + VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \ + 
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \ + VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \ + VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \ + ); \ } -#define XM_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \ - VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \ - VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \ - VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \ - VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \ - VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \ - ); \ +#define XM_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \ + VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \ + VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \ + VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \ + VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \ + VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \ + ); \ } -#define XP_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ - VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \ - VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \ - VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \ - VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \ - VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \ - VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \ - ); \ + VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \ + VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \ + VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \ + VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \ + VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \ + VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \ + ); \ } -#define XM_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ - VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \ - VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \ - VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \ - VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \ - VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \ - VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \ - ); \ + VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \ + VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \ + VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \ + VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \ + VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \ + VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \ + ); \ } // fspin(2)+=hspin(1); // fspin(3)-=hspin(0); -#define YP_RECON_ACCUM {\ - asm(\ +#define YP_RECON_ACCUM { \ + asm( \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \ VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \ - );\ - } -#define YM_RECON_ACCUM {\ - asm(\ + ); \ + } +#define YM_RECON_ACCUM { \ + asm( \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) 
VADD(psi_12,psi_12,UChi_12) \ VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \ VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \ - );\ - } + ); \ + } // fspin(2)-=timesI(hspin(0)); // fspin(3)+=timesI(hspin(1)); -#define ZP_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \ @@ -536,12 +536,12 @@ Author: paboyle VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \ VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \ VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \ - );\ - } + ); \ + } -#define ZM_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \ @@ -550,37 +550,37 @@ Author: paboyle VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \ VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \ VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \ - );\ - } + ); \ + } // fspin(2)+=hspin(0); // fspin(3)+=hspin(1); -#define TP_RECON_ACCUM {\ - asm(\ +#define TP_RECON_ACCUM { \ + asm( \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \ VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \ - );\ - } + ); \ + } -#define TM_RECON_ACCUM {\ - asm(\ +#define TM_RECON_ACCUM { \ + asm( \ VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \ VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \ VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \ VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \ - );\ - } + ); \ + } #define ADD_RESULTi(PTR,pf) \ LOAD_CHIMU(PTR) \ asm( \ - VADD(psi_00,chi_00,psi_00) VADD(psi_01,chi_01,psi_01) VADD(psi_02,chi_02,psi_02) \ - VADD(psi_10,chi_10,psi_10) VADD(psi_11,chi_11,psi_11) VADD(psi_12,chi_12,psi_12) \ - VADD(psi_20,chi_20,psi_20) VADD(psi_21,chi_21,psi_21) VADD(psi_22,chi_22,psi_22) \ - VADD(psi_30,chi_30,psi_30) VADD(psi_31,chi_31,psi_31) VADD(psi_32,chi_32,psi_32) ); \ + VADD(psi_00,chi_00,psi_00) VADD(psi_01,chi_01,psi_01) VADD(psi_02,chi_02,psi_02) \ + VADD(psi_10,chi_10,psi_10) VADD(psi_11,chi_11,psi_11) VADD(psi_12,chi_12,psi_12) \ + VADD(psi_20,chi_20,psi_20) VADD(psi_21,chi_21,psi_21) VADD(psi_22,chi_22,psi_22) \ + VADD(psi_30,chi_30,psi_30) VADD(psi_31,chi_31,psi_31) VADD(psi_32,chi_32,psi_32) ); \ SAVE_RESULT(PTR,pf); @@ -590,9 +590,9 @@ Author: paboyle #define PERMUTE_DIR0 { \ asm( \ - VPERMI(perm_reg) \ - VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \ - VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \ + VPERMI(perm_reg) \ + VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \ + VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \ } #endif From 4da437431e2c7bd31e1eec67342ece78548f4044 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:22:46 +0000 Subject: [PATCH 019/754] Reformat --- 
 lib/simd/IBM_qpx_double.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/simd/IBM_qpx_double.h b/lib/simd/IBM_qpx_double.h
index 60709102..a69b1552 100644
--- a/lib/simd/IBM_qpx_double.h
+++ b/lib/simd/IBM_qpx_double.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid
 
@@ -23,8 +23,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 // No guard; ok multi-include
 #undef VSIZE
 #undef VLOAD

From b815f5f764159b294ad0c069abde94b0ec97f09d Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:23:21 +0000
Subject: [PATCH 020/754] Formatting

---
 lib/simd/IBM_qpx_single.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/simd/IBM_qpx_single.h b/lib/simd/IBM_qpx_single.h
index ab903ea7..28bab041 100644
--- a/lib/simd/IBM_qpx_single.h
+++ b/lib/simd/IBM_qpx_single.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid
 
@@ -23,8 +23,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 // No guard; ok multi-include
 #undef VSIZE
 #undef VLOAD

From bd15c38ae8bd617c9a17b3ab034083faf85552a3 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:25:02 +0000
Subject: [PATCH 021/754] Formatting emacs compliant

---
 lib/simd/Intel512avx.h    |   66 +-
 lib/simd/Intel512common.h |   66 +-
 lib/simd/Intel512double.h |    6 +-
 lib/simd/Intel512imci.h   |   46 +-
 lib/simd/Intel512single.h |    6 +-
 lib/simd/Intel512wilson.h | 1270 ++++++++++++++++++-------------------
 6 files changed, 730 insertions(+), 730 deletions(-)

diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h
index 7b5964ad..9cded194 100644
--- a/lib/simd/Intel512avx.h
+++ b/lib/simd/Intel512avx.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid
 
@@ -23,8 +23,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_AV512_H #define GRID_ASM_AV512_H @@ -44,46 +44,46 @@ Author: paboyle #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) -#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMULMEMf(O,P,B,Biirr) \ - VMULMEMf(O,P,C,Ciirr) \ - VMULf(tmp,B,Briir) \ +#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMULMEMf(O,P,B,Biirr) \ + VMULMEMf(O,P,C,Ciirr) \ + VMULf(tmp,B,Briir) \ VMULf(tmp,C,Criir) -#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ - VMULMEMd(O,P,C,Ciirr) \ - VMULd(tmp,B,Briir) \ +#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMd(O,P,tmp) \ + VMULMEMd(O,P,B,Biirr) \ + VMULMEMd(O,P,C,Ciirr) \ + VMULd(tmp,B,Briir) \ VMULd(tmp,C,Criir) -#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMADDMEMf(O,P,B,Biirr) \ - VMADDMEMf(O,P,C,Ciirr) \ - VMADDf(tmp,B,Briir) \ +#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMADDMEMf(O,P,B,Biirr) \ + VMADDMEMf(O,P,C,Ciirr) \ + VMADDf(tmp,B,Briir) \ VMADDf(tmp,C,Criir) #define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ - VSHUFMEMd(O,P,tmp) \ - VMADDMEMd(O,P,B,Biirr) \ - VMADDMEMd(O,P,C,Ciirr) \ - VMADDd(tmp,B,Briir) \ + VSHUFMEMd(O,P,tmp) \ + VMADDMEMd(O,P,B,Biirr) \ + VMADDMEMd(O,P,C,Ciirr) \ + VMADDd(tmp,B,Briir) \ VMADDd(tmp,C,Criir) // Merges accumulation for complex dot chain; less efficient under avx512 -#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" +#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n" \ + "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" +#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n" \ + "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" #define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" +"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii +#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n" \ + "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii #define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 #define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2 @@ -123,10 +123,10 @@ Author: paboyle #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" - /* - * TimesI is used only in the XP recon - * Could zero the regs and 
use RECON_ACCUM - */ +/* + * TimesI is used only in the XP recon + * Could zero the regs and use RECON_ACCUM + */ #define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) #define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index e69e541c..c9472918 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H @@ -36,10 +36,10 @@ Author: paboyle //////////////////////////////////////////////////////////////////////////////////////////////////// // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// -#define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ - "kmovw %%eax, %%k6 \n"\ - "mov $0x5555, %%eax \n"\ +#define MASK_REGS \ + __asm__ ("mov $0xAAAA, %%eax \n" \ + "kmovw %%eax, %%k6 \n" \ + "mov $0x5555, %%eax \n" \ "kmovw %%eax, %%k7 \n" : : : "%eax"); //#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); @@ -47,44 +47,44 @@ Author: paboyle #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" -#define VTIMESIf(A,DEST, Z) \ - VTIMESI0f(A,DEST, Z) \ - VTIMESI1f(A,DEST, Z) \ +#define VTIMESIf(A,DEST, Z) \ + VTIMESI0f(A,DEST, Z) \ + VTIMESI1f(A,DEST, Z) \ VTIMESI2f(A,DEST, Z) -#define VTIMESId(A,DEST, Z) \ - VTIMESI0d(A,DEST, Z) \ - VTIMESI1d(A,DEST, Z) \ +#define VTIMESId(A,DEST, Z) \ + VTIMESI0d(A,DEST, Z) \ + VTIMESI1d(A,DEST, Z) \ VTIMESI2d(A,DEST, Z) -#define VTIMESMINUSIf(A,DEST, Z) \ - VTIMESMINUSI0f(A,DEST, Z) \ - VTIMESMINUSI1f(A,DEST, Z) \ - VTIMESMINUSI2f(A,DEST, Z) +#define VTIMESMINUSIf(A,DEST, Z) \ + VTIMESMINUSI0f(A,DEST, Z) \ + VTIMESMINUSI1f(A,DEST, Z) \ + VTIMESMINUSI2f(A,DEST, Z) -#define VTIMESMINUSId(A,DEST, Z) \ - VTIMESMINUSI0d(A,DEST, Z) \ - VTIMESMINUSI1d(A,DEST, Z) \ - VTIMESMINUSI2d(A,DEST, Z) +#define VTIMESMINUSId(A,DEST, Z) \ + VTIMESMINUSI0d(A,DEST, Z) \ + VTIMESMINUSI1d(A,DEST, Z) \ + VTIMESMINUSI2d(A,DEST, Z) #define VACCTIMESIf(A,ACC,tmp) \ - VACCTIMESI0f(A,ACC,tmp) \ - VACCTIMESI1f(A,ACC,tmp) \ - VACCTIMESI2f(A,ACC,tmp) + VACCTIMESI0f(A,ACC,tmp) \ + VACCTIMESI1f(A,ACC,tmp) \ + VACCTIMESI2f(A,ACC,tmp) #define VACCTIMESId(A,ACC,tmp) \ - VACCTIMESI0d(A,ACC,tmp) \ - VACCTIMESI1d(A,ACC,tmp) \ - VACCTIMESI2d(A,ACC,tmp) + VACCTIMESI0d(A,ACC,tmp) \ + VACCTIMESI1d(A,ACC,tmp) \ + VACCTIMESI2d(A,ACC,tmp) -#define VACCTIMESMINUSIf(A,ACC,tmp) \ - VACCTIMESMINUSI0f(A,ACC,tmp) \ - VACCTIMESMINUSI1f(A,ACC,tmp) \ +#define VACCTIMESMINUSIf(A,ACC,tmp) \ + VACCTIMESMINUSI0f(A,ACC,tmp) \ + VACCTIMESMINUSI1f(A,ACC,tmp) \ VACCTIMESMINUSI2f(A,ACC,tmp) -#define VACCTIMESMINUSId(A,ACC,tmp) \ - VACCTIMESMINUSI0d(A,ACC,tmp) \ - VACCTIMESMINUSI1d(A,ACC,tmp) \ +#define VACCTIMESMINUSId(A,ACC,tmp) \ + VACCTIMESMINUSI0d(A,ACC,tmp) \ + VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) #define 
LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 632b5639..f0b7fa0d 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ // No guard can be multiply included as undef clearage #undef VZERO #undef VMOV diff --git a/lib/simd/Intel512imci.h b/lib/simd/Intel512imci.h index 7176890f..655c001d 100644 --- a/lib/simd/Intel512imci.h +++ b/lib/simd/Intel512imci.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_AV512_H #define GRID_ASM_AV512_H @@ -44,32 +44,32 @@ Author: paboyle #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) -#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMULMEMf(O,P,B,Biirr) \ - VMULMEMf(O,P,C,Ciirr) \ - VMULf(tmp,B,Briir) \ +#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMULMEMf(O,P,B,Biirr) \ + VMULMEMf(O,P,C,Ciirr) \ + VMULf(tmp,B,Briir) \ VMULf(tmp,C,Criir) -#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMd(O,P,tmp) \ +#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMd(O,P,tmp) \ VMULMEMd(O,P,B,Biirr) \ - VMULMEMd(O,P,C,Ciirr) \ - VMULd(tmp,B,Briir) \ - VMULd(tmp,C,Criir) +VMULMEMd(O,P,C,Ciirr) \ +VMULd(tmp,B,Briir) \ +VMULd(tmp,C,Criir) -#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMADDMEMf(O,P,B,Biirr) \ - VMADDMEMf(O,P,C,Ciirr) \ - VMADDf(tmp,B,Briir) \ +#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMADDMEMf(O,P,B,Biirr) \ + VMADDMEMf(O,P,C,Ciirr) \ + VMADDf(tmp,B,Briir) \ VMADDf(tmp,C,Criir) #define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ - VSHUFMEMd(O,P,tmp) \ - VMADDMEMd(O,P,B,Biirr) \ - VMADDMEMd(O,P,C,Ciirr) \ - VMADDd(tmp,B,Briir) \ + VSHUFMEMd(O,P,tmp) \ + VMADDMEMd(O,P,B,Biirr) \ + VMADDMEMd(O,P,C,Ciirr) \ + VMADDd(tmp,B,Briir) \ VMADDd(tmp,C,Criir) #define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" @@ -106,7 +106,7 @@ Author: paboyle #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - // 
Acc = Acc - i A +// Acc = Acc - i A #define VACCTIMESMINUSI0d(A,ACC,tmp) #define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index ed135651..5c1e4135 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ // No guard can be multiply included as undef clearge of macros #undef VZERO #undef VMOV diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 64142a2e..66781387 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_INTEL_512_QCD_H #define GRID_ASM_INTEL_512_QCD_H @@ -109,18 +109,18 @@ Author: paboyle #define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) #define ADD_RESULT(PT,R) ADD_RESULTi(PT,R) -#define ZERO_PSI \ - asm( VZERO(psi_00) \ - VZERO(psi_01) \ - VZERO(psi_02) \ - VZERO(psi_10) \ - VZERO(psi_11) \ - VZERO(psi_12) \ - VZERO(psi_20) \ - VZERO(psi_21) \ - VZERO(psi_22) \ - VZERO(psi_30) \ - VZERO(psi_31) \ +#define ZERO_PSI \ + asm( VZERO(psi_00) \ + VZERO(psi_01) \ + VZERO(psi_02) \ + VZERO(psi_10) \ + VZERO(psi_11) \ + VZERO(psi_12) \ + VZERO(psi_20) \ + VZERO(psi_21) \ + VZERO(psi_22) \ + VZERO(psi_30) \ + VZERO(psi_31) \ VZERO(psi_32)); #define LOAD_CHIMUi \ @@ -143,41 +143,41 @@ Author: paboyle VLOAD(10,%r8,Chimu_31) \ VLOAD(11,%r8,Chimu_32) -#define SHUF_CHIMU23i\ - VSHUFMEM(6,%r8,Chimu_20) \ - VSHUFMEM(7,%r8,Chimu_21) \ - VSHUFMEM(8,%r8,Chimu_22) \ - VSHUFMEM(9,%r8,Chimu_30) \ - VSHUFMEM(10,%r8,Chimu_31) \ - VSHUFMEM(11,%r8,Chimu_32) +#define SHUF_CHIMU23i \ + VSHUFMEM(6,%r8,Chimu_20) \ + VSHUFMEM(7,%r8,Chimu_21) \ + VSHUFMEM(8,%r8,Chimu_22) \ + VSHUFMEM(9,%r8,Chimu_30) \ + VSHUFMEM(10,%r8,Chimu_31) \ + VSHUFMEM(11,%r8,Chimu_32) #define LOAD_CHIi \ - VLOAD(0,%r8,Chi_00) \ - VLOAD(1,%r8,Chi_01) \ - VLOAD(2,%r8,Chi_02) \ - VLOAD(3,%r8,Chi_10) \ - VLOAD(4,%r8,Chi_11) \ + VLOAD(0,%r8,Chi_00) \ + VLOAD(1,%r8,Chi_01) \ + VLOAD(2,%r8,Chi_02) \ + VLOAD(3,%r8,Chi_10) \ + VLOAD(4,%r8,Chi_11) \ VLOAD(5,%r8,Chi_12) -#define SAVE_UCHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,UChi_00) \ - VSTORE(1,%r8,UChi_01) \ - VSTORE(2,%r8,UChi_02) \ - VSTORE(3,%r8,UChi_10) \ - VSTORE(4,%r8,UChi_11) \ - VSTORE(5,%r8,UChi_12) ); +#define SAVE_UCHIi(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + VSTORE(0,%r8,UChi_00) \ + VSTORE(1,%r8,UChi_01) \ + VSTORE(2,%r8,UChi_02) \ + VSTORE(3,%r8,UChi_10) \ + VSTORE(4,%r8,UChi_11) \ + VSTORE(5,%r8,UChi_12) 
); -#define SAVE_CHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,Chi_00) \ - VSTORE(1,%r8,Chi_01) \ - VSTORE(2,%r8,Chi_02) \ - VSTORE(3,%r8,Chi_10) \ - VSTORE(4,%r8,Chi_11) \ - VSTORE(5,%r8,Chi_12) ); +#define SAVE_CHIi(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + VSTORE(0,%r8,Chi_00) \ + VSTORE(1,%r8,Chi_01) \ + VSTORE(2,%r8,Chi_02) \ + VSTORE(3,%r8,Chi_10) \ + VSTORE(4,%r8,Chi_11) \ + VSTORE(5,%r8,Chi_12) ); #define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p) #define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf) @@ -187,352 +187,352 @@ Author: paboyle ////////////////////////////////////////////////////////////////// // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ +#define XP_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ - VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ + SHUF_CHIMU23i \ + VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ VACCTIMESI2(Chi_12,Chi_12,Chimu_22) ); -#define YP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \ - VSUBMEM(10,%r8,Chimu_01,Chi_01) \ - VSUBMEM(11,%r8,Chimu_02,Chi_02) \ - VADDMEM(6,%r8,Chimu_10,Chi_10) \ - VADDMEM(7,%r8,Chimu_11,Chi_11) \ - VADDMEM(8,%r8,Chimu_12,Chi_12) ); +#define YP_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \ + VSUBMEM(10,%r8,Chimu_01,Chi_01) \ + VSUBMEM(11,%r8,Chimu_02,Chi_02) \ + VADDMEM(6,%r8,Chimu_10,Chi_10) \ + VADDMEM(7,%r8,Chimu_11,Chi_11) \ + VADDMEM(8,%r8,Chimu_12,Chi_12) ); -#define ZP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ +#define ZP_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ + SHUF_CHIMU23i \ + VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \ VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \ VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) ); -#define TP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(6,%r8 
,Chimu_00,Chi_00) \ - VADDMEM(7,%r8,Chimu_01,Chi_01) \ - VADDMEM(8,%r8,Chimu_02,Chi_02) \ - VADDMEM(9,%r8,Chimu_10,Chi_10) \ - VADDMEM(10,%r8,Chimu_11,Chi_11) \ +#define TP_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VADDMEM(6,%r8 ,Chimu_00,Chi_00) \ + VADDMEM(7,%r8,Chimu_01,Chi_01) \ + VADDMEM(8,%r8,Chimu_02,Chi_02) \ + VADDMEM(9,%r8,Chimu_10,Chi_10) \ + VADDMEM(10,%r8,Chimu_11,Chi_11) \ VADDMEM(11,%r8,Chimu_12,Chi_12) ); // hspin(0)=fspin(0)-timesI(fspin(3)) // hspin(1)=fspin(1)-timesI(fspin(2)) -#define XM_PROJMEM(PTR) \ - LOAD64(%r8,PTR)\ - __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ - VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\ +#define XM_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + LOAD_CHIi \ + SHUF_CHIMU23i \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22) \ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21) \ VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) ); -#define YM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(9,%r8 ,Chimu_00,Chi_00) \ - VADDMEM(10,%r8,Chimu_01,Chi_01) \ - VADDMEM(11,%r8,Chimu_02,Chi_02) \ - VSUBMEM(6,%r8,Chimu_10,Chi_10) \ - VSUBMEM(7,%r8,Chimu_11,Chi_11) \ - VSUBMEM(8,%r8,Chimu_12,Chi_12) ); - -#define ZM_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ +#define YM_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\ - VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\ + LOAD_CHIMU01i \ + VADDMEM(9,%r8 ,Chimu_00,Chi_00) \ + VADDMEM(10,%r8,Chimu_01,Chi_01) \ + VADDMEM(11,%r8,Chimu_02,Chi_02) \ + VSUBMEM(6,%r8,Chimu_10,Chi_10) \ + VSUBMEM(7,%r8,Chimu_11,Chi_11) \ + VSUBMEM(8,%r8,Chimu_12,Chi_12) ); + +#define ZM_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + LOAD_CHIi \ + SHUF_CHIMU23i \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \ VACCTIMESI2(Chi_12,Chi_12,Chimu_32) ); -#define TM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - 
VSUBMEM(6,%r8,Chimu_00,Chi_00) \ - VSUBMEM(7,%r8,Chimu_01,Chi_01) \ - VSUBMEM(8,%r8,Chimu_02,Chi_02) \ - VSUBMEM(9,%r8,Chimu_10,Chi_10) \ - VSUBMEM(10,%r8,Chimu_11,Chi_11) \ - VSUBMEM(11,%r8,Chimu_12,Chi_12) ); +#define TM_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VSUBMEM(6,%r8,Chimu_00,Chi_00) \ + VSUBMEM(7,%r8,Chimu_01,Chi_01) \ + VSUBMEM(8,%r8,Chimu_02,Chi_02) \ + VSUBMEM(9,%r8,Chimu_10,Chi_10) \ + VSUBMEM(10,%r8,Chimu_11,Chi_11) \ + VSUBMEM(11,%r8,Chimu_12,Chi_12) ); // fspin(0)=hspin(0) // fspin(1)=hspin(1) // fspin(2)=timesMinusI(hspin(1)) // fspin(3)=timesMinusI(hspin(0)) -#define XP_RECON __asm__ ( \ - VZERO(TMP) \ +#define XP_RECON __asm__ ( \ + VZERO(TMP) \ VTIMESMINUSI0(UChi_00,psi_30,TMP) \ VTIMESMINUSI0(UChi_10,psi_20,TMP) \ VTIMESMINUSI0(UChi_01,psi_31,TMP) \ VTIMESMINUSI0(UChi_11,psi_21,TMP) \ - VTIMESMINUSI0(UChi_02,psi_32,TMP) \ + VTIMESMINUSI0(UChi_02,psi_32,TMP) \ VTIMESMINUSI0(UChi_12,psi_22,TMP) \ - VMOV(UChi_00,psi_00) \ - VMOV(UChi_10,psi_10) \ - VMOV(UChi_01,psi_01) \ - VMOV(UChi_11,psi_11) \ - VMOV(UChi_02,psi_02) \ - VMOV(UChi_12,psi_12) \ + VMOV(UChi_00,psi_00) \ + VMOV(UChi_10,psi_10) \ + VMOV(UChi_01,psi_01) \ + VMOV(UChi_11,psi_11) \ + VMOV(UChi_02,psi_02) \ + VMOV(UChi_12,psi_12) \ VTIMESMINUSI1(UChi_10,psi_20,TMP) \ VTIMESMINUSI1(UChi_11,psi_21,TMP) \ VTIMESMINUSI1(UChi_12,psi_22,TMP) \ VTIMESMINUSI1(UChi_00,psi_30,TMP) \ VTIMESMINUSI1(UChi_01,psi_31,TMP) \ - VTIMESMINUSI1(UChi_02,psi_32,TMP) \ + VTIMESMINUSI1(UChi_02,psi_32,TMP) \ VTIMESMINUSI2(UChi_10,psi_20,TMP) \ VTIMESMINUSI2(UChi_11,psi_21,TMP) \ VTIMESMINUSI2(UChi_12,psi_22,TMP) \ VTIMESMINUSI2(UChi_00,psi_30,TMP) \ VTIMESMINUSI2(UChi_01,psi_31,TMP) \ - VTIMESMINUSI2(UChi_02,psi_32,TMP) \ + VTIMESMINUSI2(UChi_02,psi_32,TMP) \ ); - // NB could save 6 ops using addsub => 12 cycles -#define XP_RECON_ACCUM __asm__ ( \ - VZERO(TMP)\ - VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\ - VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\ - VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\ +// NB could save 6 ops using addsub => 12 cycles +#define XP_RECON_ACCUM __asm__ ( \ + VZERO(TMP) \ + VACCTIMESMINUSI0(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI0(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI0(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI0(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI0(UChi_02,psi_32,Z5) \ + VACCTIMESMINUSI0(UChi_12,psi_22,Z2) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESMINUSI1(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI1(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI1(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI1(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI1(UChi_02,psi_32,Z5) \ + VACCTIMESMINUSI1(UChi_12,psi_22,Z2) \ + 
VACCTIMESMINUSI2(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI2(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI2(UChi_12,psi_22,Z2) \ + VACCTIMESMINUSI2(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI2(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI2(UChi_02,psi_32,Z5) \ ); -#define XM_RECON __asm__ ( \ - VZERO(TMP)\ - VTIMESI0(UChi_00,psi_30,TMP)\ - VTIMESI0(UChi_10,psi_20,TMP)\ - VTIMESI0(UChi_01,psi_31,TMP)\ - VTIMESI0(UChi_11,psi_21,TMP)\ - VTIMESI0(UChi_02,psi_32,TMP)\ - VTIMESI0(UChi_12,psi_22,TMP)\ - VMOV(UChi_00,psi_00)\ - VMOV(UChi_10,psi_10)\ - VMOV(UChi_01,psi_01)\ - VMOV(UChi_11,psi_11)\ - VMOV(UChi_02,psi_02)\ - VMOV(UChi_12,psi_12)\ - VTIMESI1(UChi_00,psi_30,TMP)\ - VTIMESI1(UChi_10,psi_20,TMP)\ - VTIMESI1(UChi_01,psi_31,TMP)\ - VTIMESI1(UChi_11,psi_21,TMP)\ - VTIMESI1(UChi_02,psi_32,TMP)\ - VTIMESI1(UChi_12,psi_22,TMP)\ - VTIMESI2(UChi_10,psi_20,TMP)\ - VTIMESI2(UChi_11,psi_21,TMP)\ - VTIMESI2(UChi_12,psi_22,TMP)\ - VTIMESI2(UChi_00,psi_30,TMP)\ - VTIMESI2(UChi_01,psi_31,TMP)\ - VTIMESI2(UChi_02,psi_32,TMP)\ +#define XM_RECON __asm__ ( \ + VZERO(TMP) \ + VTIMESI0(UChi_00,psi_30,TMP) \ + VTIMESI0(UChi_10,psi_20,TMP) \ + VTIMESI0(UChi_01,psi_31,TMP) \ + VTIMESI0(UChi_11,psi_21,TMP) \ + VTIMESI0(UChi_02,psi_32,TMP) \ + VTIMESI0(UChi_12,psi_22,TMP) \ + VMOV(UChi_00,psi_00) \ + VMOV(UChi_10,psi_10) \ + VMOV(UChi_01,psi_01) \ + VMOV(UChi_11,psi_11) \ + VMOV(UChi_02,psi_02) \ + VMOV(UChi_12,psi_12) \ + VTIMESI1(UChi_00,psi_30,TMP) \ + VTIMESI1(UChi_10,psi_20,TMP) \ + VTIMESI1(UChi_01,psi_31,TMP) \ + VTIMESI1(UChi_11,psi_21,TMP) \ + VTIMESI1(UChi_02,psi_32,TMP) \ + VTIMESI1(UChi_12,psi_22,TMP) \ + VTIMESI2(UChi_10,psi_20,TMP) \ + VTIMESI2(UChi_11,psi_21,TMP) \ + VTIMESI2(UChi_12,psi_22,TMP) \ + VTIMESI2(UChi_00,psi_30,TMP) \ + VTIMESI2(UChi_01,psi_31,TMP) \ + VTIMESI2(UChi_02,psi_32,TMP) \ ); -#define XM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_10,psi_20,Z0)\ - VACCTIMESI0(UChi_00,psi_30,Z3)\ - VACCTIMESI0(UChi_11,psi_21,Z1)\ - VACCTIMESI0(UChi_01,psi_31,Z4)\ - VACCTIMESI0(UChi_12,psi_22,Z2)\ - VACCTIMESI0(UChi_02,psi_32,Z5)\ - \ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_02,psi_02,psi_02)\ - \ - VACCTIMESI1(UChi_10,psi_20,Z0)\ - VACCTIMESI1(UChi_00,psi_30,Z3)\ - VACCTIMESI1(UChi_11,psi_21,Z1)\ - VACCTIMESI1(UChi_01,psi_31,Z4)\ - VACCTIMESI1(UChi_12,psi_22,Z2)\ - VACCTIMESI1(UChi_02,psi_32,Z5)\ - VACCTIMESI2(UChi_10,psi_20,Z0)\ - VACCTIMESI2(UChi_11,psi_21,Z1)\ - VACCTIMESI2(UChi_12,psi_22,Z2)\ - VACCTIMESI2(UChi_00,psi_30,Z3)\ - VACCTIMESI2(UChi_01,psi_31,Z4)\ - VACCTIMESI2(UChi_02,psi_32,Z5)\ +#define XM_RECON_ACCUM __asm__ ( \ + VACCTIMESI0(UChi_10,psi_20,Z0) \ + VACCTIMESI0(UChi_00,psi_30,Z3) \ + VACCTIMESI0(UChi_11,psi_21,Z1) \ + VACCTIMESI0(UChi_01,psi_31,Z4) \ + VACCTIMESI0(UChi_12,psi_22,Z2) \ + VACCTIMESI0(UChi_02,psi_32,Z5) \ + \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_02,psi_02,psi_02) \ + \ + VACCTIMESI1(UChi_10,psi_20,Z0) \ + VACCTIMESI1(UChi_00,psi_30,Z3) \ + VACCTIMESI1(UChi_11,psi_21,Z1) \ + VACCTIMESI1(UChi_01,psi_31,Z4) \ + VACCTIMESI1(UChi_12,psi_22,Z2) \ + VACCTIMESI1(UChi_02,psi_32,Z5) \ + VACCTIMESI2(UChi_10,psi_20,Z0) \ + VACCTIMESI2(UChi_11,psi_21,Z1) \ + VACCTIMESI2(UChi_12,psi_22,Z2) \ + VACCTIMESI2(UChi_00,psi_30,Z3) \ + VACCTIMESI2(UChi_01,psi_31,Z4) \ + VACCTIMESI2(UChi_02,psi_32,Z5) \ ); -#define YP_RECON_ACCUM __asm__ ( \ - 
VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_10,psi_20,psi_20)\ - VADD(UChi_11,psi_21,psi_21)\ - VADD(UChi_12,psi_22,psi_22)\ - VSUB(UChi_00,psi_30,psi_30)\ - VSUB(UChi_01,psi_31,psi_31)\ - VSUB(UChi_02,psi_32,psi_32) ); +#define YP_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_10,psi_20,psi_20) \ + VADD(UChi_11,psi_21,psi_21) \ + VADD(UChi_12,psi_22,psi_22) \ + VSUB(UChi_00,psi_30,psi_30) \ + VSUB(UChi_01,psi_31,psi_31) \ + VSUB(UChi_02,psi_32,psi_32) ); -#define YM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VSUB(UChi_10,psi_20,psi_20)\ - VSUB(UChi_11,psi_21,psi_21)\ - VSUB(UChi_12,psi_22,psi_22)\ - VADD(UChi_00,psi_30,psi_30)\ - VADD(UChi_01,psi_31,psi_31)\ - VADD(UChi_02,psi_32,psi_32) ); +#define YM_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VSUB(UChi_10,psi_20,psi_20) \ + VSUB(UChi_11,psi_21,psi_21) \ + VSUB(UChi_12,psi_22,psi_22) \ + VADD(UChi_00,psi_30,psi_30) \ + VADD(UChi_01,psi_31,psi_31) \ + VADD(UChi_02,psi_32,psi_32) ); -#define ZP_RECON_ACCUM __asm__ ( \ - VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\ - VACCTIMESI0(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\ - VACCTIMESI0(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\ - VACCTIMESI0(UChi_12,psi_32,Z5)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\ - VACCTIMESI1(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\ - VACCTIMESI1(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\ - VACCTIMESI1(UChi_12,psi_32,Z5)\ - VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\ - VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\ - VACCTIMESI2(UChi_10,psi_30,Z3)\ - VACCTIMESI2(UChi_11,psi_31,Z4)\ - VACCTIMESI2(UChi_12,psi_32,Z5)\ +#define ZP_RECON_ACCUM __asm__ ( \ + VACCTIMESMINUSI0(UChi_00,psi_20,Z0) \ + VACCTIMESI0(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI0(UChi_01,psi_21,Z1) \ + VACCTIMESI0(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI0(UChi_02,psi_22,Z2) \ + VACCTIMESI0(UChi_12,psi_32,Z5) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESMINUSI1(UChi_00,psi_20,Z0) \ + VACCTIMESI1(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI1(UChi_01,psi_21,Z1) \ + VACCTIMESI1(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI1(UChi_02,psi_22,Z2) \ + VACCTIMESI1(UChi_12,psi_32,Z5) \ + VACCTIMESMINUSI2(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI2(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI2(UChi_02,psi_22,Z2) \ + VACCTIMESI2(UChi_10,psi_30,Z3) \ + VACCTIMESI2(UChi_11,psi_31,Z4) \ + VACCTIMESI2(UChi_12,psi_32,Z5) \ ); -#define ZM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\ - VACCTIMESI0(UChi_01,psi_21,Z1)\ - 
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\ - VACCTIMESI0(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESI1(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\ - VACCTIMESI1(UChi_01,psi_21,Z1)\ - VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\ - VACCTIMESI1(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\ - VACCTIMESI2(UChi_00,psi_20,Z0)\ - VACCTIMESI2(UChi_01,psi_21,Z1)\ - VACCTIMESI2(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\ +#define ZM_RECON_ACCUM __asm__ ( \ + VACCTIMESI0(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI0(UChi_10,psi_30,Z3) \ + VACCTIMESI0(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI0(UChi_11,psi_31,Z4) \ + VACCTIMESI0(UChi_02,psi_22,Z2) \ + VACCTIMESMINUSI0(UChi_12,psi_32,Z5) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESI1(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI1(UChi_10,psi_30,Z3) \ + VACCTIMESI1(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI1(UChi_11,psi_31,Z4) \ + VACCTIMESI1(UChi_02,psi_22,Z2) \ + VACCTIMESMINUSI1(UChi_12,psi_32,Z5) \ + VACCTIMESI2(UChi_00,psi_20,Z0) \ + VACCTIMESI2(UChi_01,psi_21,Z1) \ + VACCTIMESI2(UChi_02,psi_22,Z2) \ + VACCTIMESMINUSI2(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI2(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI2(UChi_12,psi_32,Z5) \ ); -#define TP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_00,psi_20,psi_20)\ - VADD(UChi_10,psi_30,psi_30)\ - VADD(UChi_01,psi_21,psi_21)\ - VADD(UChi_11,psi_31,psi_31)\ - VADD(UChi_02,psi_22,psi_22)\ - VADD(UChi_12,psi_32,psi_32) ); +#define TP_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_00,psi_20,psi_20) \ + VADD(UChi_10,psi_30,psi_30) \ + VADD(UChi_01,psi_21,psi_21) \ + VADD(UChi_11,psi_31,psi_31) \ + VADD(UChi_02,psi_22,psi_22) \ + VADD(UChi_12,psi_32,psi_32) ); -#define TM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VSUB(UChi_00,psi_20,psi_20)\ - VSUB(UChi_10,psi_30,psi_30)\ - VSUB(UChi_01,psi_21,psi_21)\ - VSUB(UChi_11,psi_31,psi_31)\ - VSUB(UChi_02,psi_22,psi_22)\ - VSUB(UChi_12,psi_32,psi_32) ); +#define TM_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VSUB(UChi_00,psi_20,psi_20) \ + VSUB(UChi_10,psi_30,psi_30) \ + VSUB(UChi_01,psi_21,psi_21) \ + VSUB(UChi_11,psi_31,psi_31) \ + VSUB(UChi_02,psi_22,psi_22) \ + VSUB(UChi_12,psi_32,psi_32) ); #define AVX512_PF_L1 #define AVX512_PF_L2_GAUGE @@ -560,27 +560,27 @@ Author: paboyle #define VPREFETCH_G2(A,B) VPREFETCH2(A,B) #endif -#define PF_GAUGE(A) \ +#define PF_GAUGE(A) \ LOAD64(%r8,&U._odata[sU](A)) \ __asm__ ( \ VPREFETCH_G1(0,%r8) 
VPREFETCH_G1(1,%r8) \ VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ ); -#define SAVE_RESULTi(PTR,pf) \ - LOAD64(%r8,PTR) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \ - VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \ - VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \ - VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \ - VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \ - VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \ - VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \ - VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \ - VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \ - VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \ +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \ VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \ VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \ ); @@ -596,343 +596,343 @@ Author: paboyle #define ADD_RESULTia(PTR,pf) \ - LOAD64(%r8,PTR) \ + LOAD64(%r8,PTR) \ __asm__ ( \ - VADDMEM(0,%r8,psi_00,psi_00) \ - VADDMEM(1,%r8,psi_01,psi_01) \ - VADDMEM(2,%r8,psi_02,psi_02) \ - VADDMEM(3,%r8,psi_10,psi_10) \ - VADDMEM(4,%r8,psi_11,psi_11) \ - VADDMEM(5,%r8,psi_12,psi_12) \ - VADDMEM(6,%r8,psi_20,psi_20) \ - VADDMEM(7,%r8,psi_21,psi_21) \ - VADDMEM(8,%r8,psi_22,psi_22) \ - VADDMEM(9,%r8,psi_30,psi_30) \ - VADDMEM(10,%r8,psi_31,psi_31) \ + VADDMEM(0,%r8,psi_00,psi_00) \ + VADDMEM(1,%r8,psi_01,psi_01) \ + VADDMEM(2,%r8,psi_02,psi_02) \ + VADDMEM(3,%r8,psi_10,psi_10) \ + VADDMEM(4,%r8,psi_11,psi_11) \ + VADDMEM(5,%r8,psi_12,psi_12) \ + VADDMEM(6,%r8,psi_20,psi_20) \ + VADDMEM(7,%r8,psi_21,psi_21) \ + VADDMEM(8,%r8,psi_22,psi_22) \ + VADDMEM(9,%r8,psi_30,psi_30) \ + VADDMEM(10,%r8,psi_31,psi_31) \ VADDMEM(11,%r8,psi_32,psi_32) \ - VSTORE(0,%r8,psi_00) \ - VSTORE(1,%r8,psi_01) \ - VSTORE(2,%r8,psi_02) \ - VSTORE(3,%r8,psi_10) \ - VSTORE(4,%r8,psi_11) \ - VSTORE(5,%r8,psi_12) \ - VSTORE(6,%r8,psi_20) \ - VSTORE(7,%r8,psi_21) \ - VSTORE(8,%r8,psi_22) \ - VSTORE(9,%r8,psi_30) \ - VSTORE(10,%r8,psi_31) \ - VSTORE(11,%r8,psi_32) \ + VSTORE(0,%r8,psi_00) \ + VSTORE(1,%r8,psi_01) \ + VSTORE(2,%r8,psi_02) \ + VSTORE(3,%r8,psi_10) \ + VSTORE(4,%r8,psi_11) \ + VSTORE(5,%r8,psi_12) \ + VSTORE(6,%r8,psi_20) \ + VSTORE(7,%r8,psi_21) \ + VSTORE(8,%r8,psi_22) \ + VSTORE(9,%r8,psi_30) \ + VSTORE(10,%r8,psi_31) \ + VSTORE(11,%r8,psi_32) \ ); #ifdef AVX512_PF_L2_TABLE -#define PREFETCH_CHIMU(A) \ - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCH_P1(0,%r9) \ - VPREFETCH_P1(1,%r9) \ - VPREFETCH_P1(2,%r9) \ - VPREFETCH_P1(3,%r9) \ - VPREFETCH_P1(4,%r9) \ - VPREFETCH_P1(5,%r9) \ - VPREFETCH_P1(6,%r9) \ - VPREFETCH_P1(7,%r9) \ - VPREFETCH_P1(8,%r9) \ - VPREFETCH_P1(9,%r9) \ - VPREFETCH_P1(10,%r9) \ +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ VPREFETCH_P1(11,%r9)); #else #define PREFETCH_CHIMU(A) #endif -#define PREFETCH1_CHIMU(A) \ - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCH_P1(0,%r9) \ - 
VPREFETCH_P1(1,%r9) \ - VPREFETCH_P1(2,%r9) \ - VPREFETCH_P1(3,%r9) \ - VPREFETCH_P1(4,%r9) \ - VPREFETCH_P1(5,%r9) \ - VPREFETCH_P1(6,%r9) \ - VPREFETCH_P1(7,%r9) \ - VPREFETCH_P1(8,%r9) \ - VPREFETCH_P1(9,%r9) \ - VPREFETCH_P1(10,%r9) \ +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ VPREFETCH_P1(11,%r9)); -#define PERMUTE_DIR0 __asm__ ( \ - VPERM0(Chi_00,Chi_00) \ - VPERM0(Chi_01,Chi_01) \ - VPERM0(Chi_02,Chi_02) \ - VPERM0(Chi_10,Chi_10) \ - VPERM0(Chi_11,Chi_11) \ - VPERM0(Chi_12,Chi_12) ); +#define PERMUTE_DIR0 __asm__ ( \ + VPERM0(Chi_00,Chi_00) \ + VPERM0(Chi_01,Chi_01) \ + VPERM0(Chi_02,Chi_02) \ + VPERM0(Chi_10,Chi_10) \ + VPERM0(Chi_11,Chi_11) \ + VPERM0(Chi_12,Chi_12) ); -#define PERMUTE_DIR1 __asm__ ( \ - VPERM1(Chi_00,Chi_00) \ - VPERM1(Chi_01,Chi_01) \ - VPERM1(Chi_02,Chi_02) \ - VPERM1(Chi_10,Chi_10) \ - VPERM1(Chi_11,Chi_11) \ - VPERM1(Chi_12,Chi_12)); +#define PERMUTE_DIR1 __asm__ ( \ + VPERM1(Chi_00,Chi_00) \ + VPERM1(Chi_01,Chi_01) \ + VPERM1(Chi_02,Chi_02) \ + VPERM1(Chi_10,Chi_10) \ + VPERM1(Chi_11,Chi_11) \ + VPERM1(Chi_12,Chi_12)); -#define PERMUTE_DIR2 __asm__ ( \ - VPERM2(Chi_00,Chi_00) \ - VPERM2(Chi_01,Chi_01) \ - VPERM2(Chi_02,Chi_02) \ - VPERM2(Chi_10,Chi_10) \ - VPERM2(Chi_11,Chi_11) \ - VPERM2(Chi_12,Chi_12) ); +#define PERMUTE_DIR2 __asm__ ( \ + VPERM2(Chi_00,Chi_00) \ + VPERM2(Chi_01,Chi_01) \ + VPERM2(Chi_02,Chi_02) \ + VPERM2(Chi_10,Chi_10) \ + VPERM2(Chi_11,Chi_11) \ + VPERM2(Chi_12,Chi_12) ); -#define PERMUTE_DIR3 __asm__ ( \ - VPERM3(Chi_00,Chi_00) \ - VPERM3(Chi_01,Chi_01) \ - VPERM3(Chi_02,Chi_02) \ - VPERM3(Chi_10,Chi_10) \ - VPERM3(Chi_11,Chi_11) \ - VPERM3(Chi_12,Chi_12) ); +#define PERMUTE_DIR3 __asm__ ( \ + VPERM3(Chi_00,Chi_00) \ + VPERM3(Chi_01,Chi_01) \ + VPERM3(Chi_02,Chi_02) \ + VPERM3(Chi_10,Chi_10) \ + VPERM3(Chi_11,Chi_11) \ + VPERM3(Chi_12,Chi_12) ); #define MULT_ADDSUB_2SPIN(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VPREFETCH_G2(9,%r8) \ - VPREFETCH_G2(10,%r8) \ - VPREFETCH_G2(11,%r8) \ - VPREFETCH_G2(12,%r8) \ - VPREFETCH_G2(13,%r8) \ - VPREFETCH_G2(14,%r8) \ - VPREFETCH_G2(15,%r8) \ - VPREFETCH_G2(16,%r8) \ - VPREFETCH_G2(17,%r8) \ - VSHUF(Chi_00,T1) \ - VMOVIDUP(0,%r8,Z0 ) \ - VMOVIDUP(3,%r8,Z1 ) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ + VSHUF(Chi_00,T1) \ + VMOVIDUP(0,%r8,Z0 ) \ + VMOVIDUP(3,%r8,Z1 ) \ VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \ /*6*/ \ - VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \ - VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \ - VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \ - VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ - VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ - VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCH_M1(0,%r9) \ - VPREFETCH_M1(1,%r9) \ - VPREFETCH_M1(2,%r9) \ - VPREFETCH_M1(3,%r9) \ - /*18*/ \ - VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ - VMADDSUB(Z3,Chi_10,UChi_10) \ - VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \ - VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ - VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ - VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCH_M1(4,%r9) \ - 
VPREFETCH_M1(5,%r9) \ - VPREFETCH_M1(6,%r9) \ - VPREFETCH_M1(7,%r9) \ - /*28*/ \ - VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ - VMADDSUB(Z0,T2,UChi_10) \ - VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \ - VMADDSUB(Z1,T2,UChi_11) \ - VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ - VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - /*38*/ \ - VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ - VMADDSUB(Z3,Chi_11,UChi_10) \ - VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \ - VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ - VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ - VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCH_M1(9,%r8) \ - VPREFETCH_M1(10,%r8) \ - VPREFETCH_M1(11,%r8) \ - VPREFETCH_M1(12,%r8) \ - VPREFETCH_M1(13,%r8) \ - VPREFETCH_M1(14,%r8) \ - VPREFETCH_M1(15,%r8) \ - VPREFETCH_M1(16,%r8) \ - VPREFETCH_M1(17,%r8) \ - /*48*/ \ - VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ - VMADDSUB(Z0,T2,UChi_10) \ - VMADDSUB(Z1,T1,UChi_01) \ - VMADDSUB(Z1,T2,UChi_11) \ - VMADDSUB(Z2,T1,UChi_02) \ - VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCH_M1(8,%r9) \ - VPREFETCH_M1(9,%r9) \ - VPREFETCH_M1(10,%r9) \ - VPREFETCH_M1(11,%r9) \ - /*55*/ \ - VMADDSUB(Z3,Chi_02,UChi_00) \ - VMADDSUB(Z3,Chi_12,UChi_10) \ - VMADDSUB(Z4,Chi_02,UChi_01) \ - VMADDSUB(Z4,Chi_12,UChi_11) \ - VMADDSUB(Z5,Chi_02,UChi_02) \ - VMADDSUB(Z5,Chi_12,UChi_12) \ + VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \ + VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \ + VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \ + VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ + VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ + VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ + /*18*/ \ + VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ + VMADDSUB(Z3,Chi_10,UChi_10) \ + VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ + VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_10,UChi_12) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ + /*28*/ \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10) \ + VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \ + VMADDSUB(Z1,T2,UChi_11) \ + VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ + VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ + VPREFETCH2(12,%r9) \ + VPREFETCH2(13,%r9) \ + VPREFETCH2(14,%r9) \ + VPREFETCH2(15,%r9) \ + VPREFETCH2(16,%r9) \ + VPREFETCH2(17,%r9) \ + VPREFETCH2(18,%r9) \ + VPREFETCH2(19,%r9) \ + VPREFETCH2(20,%r9) \ + VPREFETCH2(21,%r9) \ + VPREFETCH2(22,%r9) \ + VPREFETCH2(23,%r9) \ + /*38*/ \ + VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ + VMADDSUB(Z3,Chi_11,UChi_10) \ + VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ + VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_11,UChi_12) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ + /*48*/ \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10) \ + VMADDSUB(Z1,T1,UChi_01) \ + VMADDSUB(Z1,T2,UChi_11) \ + VMADDSUB(Z2,T1,UChi_02) \ + 
VMADDSUB(Z2,T2,UChi_12) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ + /*55*/ \ + VMADDSUB(Z3,Chi_02,UChi_00) \ + VMADDSUB(Z3,Chi_12,UChi_10) \ + VMADDSUB(Z4,Chi_02,UChi_01) \ + VMADDSUB(Z4,Chi_12,UChi_11) \ + VMADDSUB(Z5,Chi_02,UChi_02) \ + VMADDSUB(Z5,Chi_12,UChi_12) \ /*61 insns*/ ); -#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ - VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ - VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ - VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCH_M1(0,%r9) \ - VPREFETCH_M1(1,%r9) \ - VPREFETCH_M1(2,%r9) \ - VPREFETCH_M1(3,%r9) \ - /*8*/ \ - VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ +#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ + VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ + VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ + VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ + /*8*/ \ + VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCH_M1(4,%r9) \ - VPREFETCH_M1(5,%r9) \ - VPREFETCH_M1(6,%r9) \ - VPREFETCH_M1(7,%r9) \ - /*16*/ \ - VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ + /*16*/ \ + VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCH_M1(8,%r9) \ - VPREFETCH_M1(9,%r9) \ - VPREFETCH_M1(10,%r9) \ - VPREFETCH_M1(11,%r9) \ - /*22*/ \ - VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ + /*22*/ \ + VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH_M2(12,%r9) \ - VPREFETCH_M2(13,%r9) \ - VPREFETCH_M2(14,%r9) \ - VPREFETCH_M2(15,%r9) \ - /*30*/ \ - VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ - VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH_M2(16,%r9) \ - VPREFETCH_M2(17,%r9) \ - VPREFETCH_M2(18,%r9) \ - VPREFETCH_M2(19,%r9) \ - VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ - /*36*/ \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ + /*30*/ \ + VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ + VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ + VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ + /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH_M2(20,%r9) \ - VPREFETCH_M2(21,%r9) \ - 
VPREFETCH_M2(22,%r9) \ - VPREFETCH_M2(23,%r9) \ - VPREFETCH_G1(2,%r8) \ - VPREFETCH_G1(3,%r8) \ - VPREFETCH_G2(4,%r8) \ - VPREFETCH_G2(5,%r8) \ - VPREFETCH_G2(6,%r8) \ - VPREFETCH_G2(7,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ + VPREFETCH_G2(6,%r8) \ + VPREFETCH_G2(7,%r8) \ /*42 insns*/ ); -#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ - VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ - VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ - VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - /*8*/ \ - VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ +#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ + VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ + VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ + VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ + /*8*/ \ + VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - /*16*/ \ - VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ + /*16*/ \ + VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - /*22*/ \ - VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ + /*22*/ \ + VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - /*30*/ \ - VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ - VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ - /*36*/ \ + /*30*/ \ + VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ + VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ + VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ + /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCH1(2,%r8)*/ \ - /* VPREFETCH1(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ ); #define Z6 Chi_00 -#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - VSHUFMEM(0,%r8,Z0) \ - VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \ - VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \ - VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \ - VSHUFMEM(3,%r8,Z0) \ - VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \ - VSHUFMEM(6,%r8,Z0) \ - VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \ - VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \ - VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \ - VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \ - /*11 cycles*/ \ - VSHUFMEM(1,%r8,Z0) \ - VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \ - VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \ - VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \ - VSHUFMEM(4,%r8,Z0) \ - VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \ - VSHUFMEM(7,%r8,Z0) 
\ - VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \ - VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \ - VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \ - VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \ - /*22 cycles*/ \ - VSHUFMEM(2,%r8,Z0) \ - VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \ - VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \ - VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \ - VSHUFMEM(5,%r8,Z0) \ - VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \ - VSHUFMEM(8,%r8,Z0) \ - VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \ - /*33 cycles*/ \ - VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \ - VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \ - VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \ - /*stall*/ \ - /*stall*/ \ - /*stall*/ \ - VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \ - VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \ - VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) ) +#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + VSHUFMEM(0,%r8,Z0) \ + VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \ + VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \ + VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \ + VSHUFMEM(3,%r8,Z0) \ + VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \ + VSHUFMEM(6,%r8,Z0) \ + VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \ + VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \ + VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \ + VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \ + /*11 cycles*/ \ + VSHUFMEM(1,%r8,Z0) \ + VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \ + VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \ + VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \ + VSHUFMEM(4,%r8,Z0) \ + VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \ + VSHUFMEM(7,%r8,Z0) \ + VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \ + VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \ + VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \ + VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \ + /*22 cycles*/ \ + VSHUFMEM(2,%r8,Z0) \ + VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \ + VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \ + VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \ + VSHUFMEM(5,%r8,Z0) \ + VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \ + VSHUFMEM(8,%r8,Z0) \ + VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \ + /*33 cycles*/ \ + VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \ + VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \ + VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \ + /*stall*/ \ + /*stall*/ \ + /*stall*/ \ + VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \ + VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \ + VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) ) #endif From 68b69a2ac0d877666d758220e9d8bb12071fee8d Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:26:14 +0000 Subject: [PATCH 022/754] Namespace management --- lib/GridCore.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/GridCore.h b/lib/GridCore.h index 55396a37..5fab4c71 100644 --- a/lib/GridCore.h +++ b/lib/GridCore.h @@ -38,6 +38,7 @@ Author: paboyle #ifndef GRID_BASE_H #define GRID_BASE_H +#include #include #include From e6f7a5a818844e7ecfbdd207f5e24ea78a75f262 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:28:01 +0000 Subject: [PATCH 023/754] Namespace --- lib/util/Init.h | 58 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/lib/util/Init.h b/lib/util/Init.h index 3da00742..545ee7f6 100644 --- a/lib/util/Init.h +++ b/lib/util/Init.h @@ -1,4 +1,4 @@ - 
/*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid 

@@ -24,43 +24,43 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 #ifndef GRID_INIT_H
 #define GRID_INIT_H

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

- void Grid_init(int *argc,char ***argv);
- void Grid_finalize(void);
+void Grid_init(int *argc,char ***argv);
+void Grid_finalize(void);

- // internal, controled with --handle
- void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
- void Grid_debug_handler_init(void);
- void Grid_quiesce_nodes(void);
- void Grid_unquiesce_nodes(void);
+// internal, controlled with --handle
+void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+void Grid_debug_handler_init(void);
+void Grid_quiesce_nodes(void);
+void Grid_unquiesce_nodes(void);

- const std::vector<int> GridDefaultSimd(int dims,int nsimd);
- const std::vector<int> &GridDefaultLatt(void);
- const std::vector<int> &GridDefaultMpi(void);
- const int &GridThreads(void) ;
- void GridSetThreads(int t) ;
- void GridLogTimestamp(int);
- void GridLogLayout();
+const std::vector<int> GridDefaultSimd(int dims,int nsimd);
+const std::vector<int> &GridDefaultLatt(void);
+const std::vector<int> &GridDefaultMpi(void);
+const int &GridThreads(void) ;
+void GridSetThreads(int t) ;
+void GridLogTimestamp(int);
+void GridLogLayout();

- // Common parsing chores
- std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
- bool GridCmdOptionExists(char** begin, char** end, const std::string& option);
- std::string GridCmdVectorIntToString(const std::vector<int> & vec);
- void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
- void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec);
+// Common parsing chores
+std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
+bool GridCmdOptionExists(char** begin, char** end, const std::string& option);
+std::string GridCmdVectorIntToString(const std::vector<int> & vec);
+void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
+void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec);

- void GridParseLayout(char **argv,int argc,
- std::vector<int> &latt,
- std::vector<int> &simd,
- std::vector<int> &mpi);
+void GridParseLayout(char **argv,int argc,
+		     std::vector<int> &latt,
+		     std::vector<int> &simd,
+		     std::vector<int> &mpi);
+NAMESPACE_END(Grid);

-};
 #endif
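For orientation while reading this patch series: NAMESPACE_BEGIN and NAMESPACE_END are thin macro wrappers around an ordinary namespace block, so the conversions above are purely mechanical. Below is a minimal sketch of the expansion; the authoritative definitions live in Grid's Namespace.h header, so treat this as an illustration under that assumption rather than quoted source.

// Sketch only; Grid's Namespace.h is the authority for the real macros.
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A)   }

NAMESPACE_BEGIN(Grid);   // expands to: namespace Grid { ;
void example(void);      // hypothetical declaration inside the namespace
NAMESPACE_END(Grid);     // expands to: } ;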
From b673174b71fd303b099246f4f5a763b94b4eb407 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:29:22 +0000
Subject: [PATCH 024/754] Format, NAMESPACE

---
 lib/util/Init.cc | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/lib/util/Init.cc b/lib/util/Init.cc
index fb3d7a1e..5182e80f 100644
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -26,8 +26,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 /****************************************************************************/
 /* pab: Signal magic. Processor state dump is x86-64 specific               */
 /****************************************************************************/
@@ -71,8 +71,7 @@ feenableexcept (unsigned int excepts)
 }
 #endif

-namespace Grid {
-
+NAMESPACE_BEGIN(Grid);

 //////////////////////////////////////////////////////
 // Convenience functions to access standard command line arg
@@ -89,18 +88,18 @@ const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
 const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const std::vector<int> GridDefaultSimd(int dims,int nsimd)
 {
-  std::vector<int> layout(dims);
-  int nn=nsimd;
-  for(int d=dims-1;d>=0;d--){
-    if ( nn>=2) {
-      layout[d]=2;
-      nn/=2;
-    } else {
-      layout[d]=1;
-    }
+  std::vector<int> layout(dims);
+  int nn=nsimd;
+  for(int d=dims-1;d>=0;d--){
+    if ( nn>=2) {
+      layout[d]=2;
+      nn/=2;
+    } else {
+      layout[d]=1;
     }
-  assert(nn==1);
-  return layout;
+  }
+  assert(nn==1);
+  return layout;
 }

 ////////////////////////////////////////////////////////////
@@ -119,7 +118,7 @@ bool GridCmdOptionExists(char** begin, char** end, const std::string& option)
 {
   return std::find(begin, end, option) != end;
 }
-  // Comma separated list
+// Comma separated list
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec)
 {
   size_t pos = 0;
@@ -430,10 +429,10 @@ void Grid_finalize(void)
 }
 void GridLogLayout() {
-  std::cout << GridLogMessage << "Grid Layout\n";
-  std::cout << GridLogMessage << "\tGlobal lattice size : "<< GridCmdVectorIntToString(GridDefaultLatt()) << std::endl;
-  std::cout << GridLogMessage << "\tOpenMP threads : "<< GridThread::GetThreads() << std::endl;

From: paboyle
Date: Fri, 12 Jan 2018 23:38:15 +0000
Subject: [PATCH 025/754] Formatting, NAMESPACE

---
 lib/threads/Threads.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/threads/Threads.h b/lib/threads/Threads.h
index 36daf2af..59c6f3cb 100644
--- a/lib/threads/Threads.h
+++ b/lib/threads/Threads.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid 

@@ -24,8 +24,8 @@ Author: paboyle
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 #ifndef GRID_THREADS_H
 #define GRID_THREADS_H

@@ -56,13 +56,13 @@ Author: paboyle
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

- // Introduce a class to gain deterministic bit reproducible reduction.
- // make static; perhaps just a namespace is required.
+// Introduce a class to gain deterministic bit reproducible reduction.
+// make static; perhaps just a namespace is required.
 class GridThread {
- public:
+public:
 static int _threads;
 static int _hyperthreads;
 static int _cores;
@@ -153,5 +153,5 @@ class GridThread {

 };

-}
+NAMESPACE_END(Grid);
 #endif
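A quick aside on the macros converted in the patch above: parallel_for and friends splice a threading pragma in front of a plain for statement, so Grid call sites keep ordinary loop syntax. The sketch below is illustrative only; grid, out, in and scale are hypothetical names, and oSites() is Grid's usual outer (vectorised) site count.

// Illustrative call site, assuming an OpenMP build; the macro expands to
// "#pragma omp parallel for" followed by the for statement itself.
parallel_for(int ss = 0; ss < grid->oSites(); ss++) {
  out[ss] = scale * in[ss];   // some per-site work
}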
From 4be31ad1f682a1d591e7e13863471b26c0641f90 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:39:49 +0000
Subject: [PATCH 026/754] C++ indentation

---
 lib/serialisation/BaseIO.h | 802 +++++++++++++++++++------------------
 1 file changed, 402 insertions(+), 400 deletions(-)

diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h
index 24e1cec7..b8ca5e16 100644
--- a/lib/serialisation/BaseIO.h
+++ b/lib/serialisation/BaseIO.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid 

@@ -25,502 +25,504 @@ Author: Guido Cossu
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 #ifndef GRID_SERIALISATION_ABSTRACT_READER_H
 #define GRID_SERIALISATION_ABSTRACT_READER_H

 #include 

-namespace Grid {
- // Vector IO utilities ///////////////////////////////////////////////////////
- // helper function to read space-separated values
- template <typename T>
- std::vector<T> strToVec(const std::string s)
- {
- std::istringstream sstr(s);
- T buf;
- std::vector<T> v;
+NAMESPACE_BEGIN(Grid);
+
+// Vector IO utilities ///////////////////////////////////////////////////////
+// helper function to read space-separated values
+template <typename T>
+std::vector<T> strToVec(const std::string s)
+{
+  std::istringstream sstr(s);
+  T buf;
+  std::vector<T> v;

 while(!sstr.eof())
 {
 sstr >> buf;
 v.push_back(buf);
 }

- return v;
- }
+  return v;
+}

- // output to streams for vectors
- template < class T >
- inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
- {
- os << "[";
- for (auto &x: v)
+// output to streams for vectors
+template < class T >
+inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+{
+  os << "[";
+  for (auto &x: v)
 {
 os << x << " ";
 }
- if (v.size() > 0)
+  if (v.size() > 0)
 {
 os << "\b";
 }
- os << "]";
+  os << "]";

- return os;
- }
+  return os;
+}

- // Vector element trait //////////////////////////////////////////////////////
- template <typename T>
- struct element
- {
- typedef T type;
- static constexpr bool is_number = false;
- };
+// Vector element trait //////////////////////////////////////////////////////
+template <typename T>
+struct element
+{
+  typedef T type;
+  static constexpr bool is_number = false;
+};

- template <typename T>
- struct element<std::vector<T>>
- {
- typedef typename element<T>::type type;
- static constexpr bool is_number = std::is_arithmetic<T>::value
- or is_complex<T>::value
- or element<T>::is_number;
- };
+template <typename T>
+struct element<std::vector<T>>
+{
+  typedef typename element<T>::type type;
+  static constexpr bool is_number = std::is_arithmetic<T>::value
+    or is_complex<T>::value
+    or element<T>::is_number;
+};

- // Vector flattening utility class ////////////////////////////////////////////
- // Class to flatten a multidimensional std::vector
- template <typename V>
- class Flatten
- {
- public:
- typedef typename element<V>::type Element;
- public:
- explicit Flatten(const V &vector);
- const V & getVector(void);
- const std::vector<Element> & getFlatVector(void);
- const std::vector<size_t> & getDim(void);
- private:
- void accumulate(const Element &e);
- template <typename W>
- void accumulate(const W &v);
- void accumulateDim(const Element &e);
- template <typename W>
- void accumulateDim(const W &v);
- private:
- const V &vector_;
- std::vector<Element> flatVector_;
- std::vector<size_t> dim_;
- };
+// Vector flattening utility class ////////////////////////////////////////////
+// Class to flatten a multidimensional std::vector
+template <typename V>
+class Flatten
+{
+public:
+  typedef typename element<V>::type Element;
+public:
+  explicit Flatten(const V &vector);
+  const V & getVector(void);
+  const std::vector<Element> & getFlatVector(void);
+  const std::vector<size_t> & getDim(void);
+private:
+  void accumulate(const Element &e);
+  template <typename W>
+  void accumulate(const W &v);
+  void accumulateDim(const Element &e);
+  template <typename W>
+  void accumulateDim(const W &v);
+private:
+  const V &vector_;
+  std::vector<Element> flatVector_;
+  std::vector<size_t> dim_;
+};
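+
+// Usage sketch (illustrative values, not part of the original header):
+//   std::vector<std::vector<double>> nested = {{1, 2}, {3, 4}};
+//   Flatten<std::vector<std::vector<double>>> flat(nested);
+//   // flat.getFlatVector() == {1, 2, 3, 4};  flat.getDim() == {2, 2}
+//   // Reconstruct (declared below) inverts the operation.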
std::vector & getDim(void); - private: - void accumulate(const Element &e); - template - void accumulate(const W &v); - void accumulateDim(const Element &e); - template - void accumulateDim(const W &v); - private: - const V &vector_; - std::vector flatVector_; - std::vector dim_; - }; +// Vector flattening utility class //////////////////////////////////////////// +// Class to flatten a multidimensional std::vector +template +class Flatten +{ +public: + typedef typename element::type Element; +public: + explicit Flatten(const V &vector); + const V & getVector(void); + const std::vector & getFlatVector(void); + const std::vector & getDim(void); +private: + void accumulate(const Element &e); + template + void accumulate(const W &v); + void accumulateDim(const Element &e); + template + void accumulateDim(const W &v); +private: + const V &vector_; + std::vector flatVector_; + std::vector dim_; +}; - // Class to reconstruct a multidimensional std::vector - template - class Reconstruct - { - public: - typedef typename element::type Element; - public: - Reconstruct(const std::vector &flatVector, - const std::vector &dim); - const V & getVector(void); - const std::vector & getFlatVector(void); - const std::vector & getDim(void); - private: - void fill(std::vector &v); - template - void fill(W &v); - void resize(std::vector &v, const unsigned int dim); - template - void resize(W &v, const unsigned int dim); - private: - V vector_; - const std::vector &flatVector_; - std::vector dim_; - size_t ind_{0}; - unsigned int dimInd_{0}; - }; +// Class to reconstruct a multidimensional std::vector +template +class Reconstruct +{ +public: + typedef typename element::type Element; +public: + Reconstruct(const std::vector &flatVector, + const std::vector &dim); + const V & getVector(void); + const std::vector & getFlatVector(void); + const std::vector & getDim(void); +private: + void fill(std::vector &v); + template + void fill(W &v); + void resize(std::vector &v, const unsigned int dim); + template + void resize(W &v, const unsigned int dim); +private: + V vector_; + const std::vector &flatVector_; + std::vector dim_; + size_t ind_{0}; + unsigned int dimInd_{0}; +}; - // Pair IO utilities ///////////////////////////////////////////////////////// - // helper function to parse input in the format "" - template - inline std::istream & operator>>(std::istream &is, std::pair &buf) - { - T1 buf1; - T2 buf2; - char c; +// Pair IO utilities ///////////////////////////////////////////////////////// +// helper function to parse input in the format "" +template +inline std::istream & operator>>(std::istream &is, std::pair &buf) +{ + T1 buf1; + T2 buf2; + char c; - // Search for "pair" delimiters. - do + // Search for "pair" delimiters. + do { is.get(c); } while (c != '<' && !is.eof()); - if (c == '<') + if (c == '<') { int start = is.tellg(); do - { - is.get(c); - } while (c != '>' && !is.eof()); + { + is.get(c); + } while (c != '>' && !is.eof()); if (c == '>') - { - int end = is.tellg(); - int psize = end - start - 1; + { + int end = is.tellg(); + int psize = end - start - 1; - // Only read data between pair limiters. - is.seekg(start); - std::string tmpstr(psize, ' '); - is.read(&tmpstr[0], psize); - std::istringstream temp(tmpstr); - temp >> buf1 >> buf2; - buf = std::make_pair(buf1, buf2); - is.seekg(end); - } + // Only read data between pair limiters. 
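// Example of the accepted pair format, assuming T1 and T2 stream in the
// usual way (the value "<3 4.5>" below is illustrative only):
//   std::pair<int, double> p;
//   std::istringstream is("<3 4.5>");
//   is >> p;                            // yields p == {3, 4.5}
// The scan above located the '<' and '>' delimiters; the statements that
// follow replay only the characters between them through operator>> for
// T1 and T2.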
+        is.seekg(start);
+        std::string tmpstr(psize, ' ');
+        is.read(&tmpstr[0], psize);
+        std::istringstream temp(tmpstr);
+        temp >> buf1 >> buf2;
+        buf = std::make_pair(buf1, buf2);
+        is.seekg(end);
+      }
    }
-    is.peek();
-    return is;
-  }
+  is.peek();
+  return is;
+}

-  // output to streams for pairs
-  template <typename T1, typename T2>
-  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
+// output to streams for pairs
+template <typename T1, typename T2>
+inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
+{
+  os << "<" << p.first << " " << p.second << ">";
+  return os;
+}
+
+// Abstract writer/reader classes ////////////////////////////////////////////
+// static polymorphism implemented using CRTP idiom
+class Serializable;
+
+// Static abstract writer
+template <typename T>
+class Writer
+{
+public:
+  Writer(void);
+  virtual ~Writer(void) = default;
+  void push(const std::string &s);
+  void pop(void);
+  template <typename U>
+  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+  write(const std::string& s, const U &output);
+  template <typename U>
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  write(const std::string& s, const U &output);
+private:
+  T *upcast;
+};
+
+// Static abstract reader
+template <typename T>
+class Reader
+{
+public:
+  Reader(void);
+  virtual ~Reader(void) = default;
+  bool push(const std::string &s);
+  void pop(void);
+  template <typename U>
+  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+  read(const std::string& s, U &output);
+  template <typename U>
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  read(const std::string& s, U &output);
+protected:
+  template <typename U>
+  void fromString(U &output, const std::string &s);
+private:
+  T *upcast;
+};
+
+// What is the vtype
+template <typename T> struct isReader {
+  static const bool value = false;
+};
+template <typename T> struct isWriter {
+  static const bool value = false;
+};
+
+
+
+// Generic writer interface
+// serializable base class
+class Serializable
+{
+public:
+  template <typename T>
+  static inline void write(Writer<T> &WR,const std::string &s,
+                           const Serializable &obj)
+  {}
+
+  template <typename T>
+  static inline void read(Reader<T> &RD,const std::string &s,
+                          Serializable &obj)
+  {}
+
+  friend inline std::ostream & operator<<(std::ostream &os,
+                                          const Serializable &obj)
  {
-    os << "<" << p.first << " " << p.second << ">";
    return os;
  }
-
-  // Abstract writer/reader classes ////////////////////////////////////////////
-  // static polymorphism implemented using CRTP idiom
-  class Serializable;
+};

-  // Static abstract writer
-  template <typename T>
-  class Writer
-  {
-  public:
-    Writer(void);
-    virtual ~Writer(void) = default;
-    void push(const std::string &s);
-    void pop(void);
-    template <typename U>
-    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-    write(const std::string& s, const U &output);
-    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-    write(const std::string& s, const U &output);
-  private:
-    T *upcast;
-  };
+// Flatten class template implementation /////////////////////////////////////
+template <typename V>
+void Flatten<V>::accumulate(const Element &e)
+{
+  flatVector_.push_back(e);
+}

-  // Static abstract reader
-  template <typename T>
-  class Reader
-  {
-  public:
-    Reader(void);
-    virtual ~Reader(void) = default;
-    bool push(const std::string &s);
-    void pop(void);
-    template <typename U>
-    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-    read(const std::string& s, U &output);
-    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-    read(const std::string& s, U &output);
-  protected:
-    template <typename U>
-    void fromString(U &output, const std::string &s);
-  private:
-    T *upcast;
-  };
-
-  // What is the vtype
-  template <typename T> struct isReader {
-    static const bool value = false;
-  };
-  template <typename T> struct isWriter {
-    static const bool value = false;
-  };
-
-
-
-  // Generic writer interface
-  // serializable base class
-  class Serializable
-  {
-  public:
-    template <typename T>
-    static inline void write(Writer<T> &WR,const std::string &s,
-                             const Serializable &obj)
-    {}
-
-    template <typename T>
-    static inline void read(Reader<T> &RD,const std::string &s,
-                            Serializable &obj)
-    {}
-
-    friend inline std::ostream & operator<<(std::ostream &os,
-                                            const Serializable &obj)
-    {
-      return os;
-    }
-  };
-
-  // Flatten class template implementation /////////////////////////////////////
-  template <typename V>
-  void Flatten<V>::accumulate(const Element &e)
-  {
-    flatVector_.push_back(e);
-  }
-
-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulate(const W &v)
-  {
-    for (auto &e: v)
+template <typename V>
+template <typename W>
+void Flatten<V>::accumulate(const W &v)
+{
+  for (auto &e: v)
    {
      accumulate(e);
    }
-  }
+}

-  template <typename V>
-  void Flatten<V>::accumulateDim(const Element &e) {};
+template <typename V>
+void Flatten<V>::accumulateDim(const Element &e) {};

-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulateDim(const W &v)
-  {
-    dim_.push_back(v.size());
-    accumulateDim(v[0]);
-  }
+template <typename V>
+template <typename W>
+void Flatten<V>::accumulateDim(const W &v)
+{
+  dim_.push_back(v.size());
+  accumulateDim(v[0]);
+}

-  template <typename V>
-  Flatten<V>::Flatten(const V &vector)
+template <typename V>
+Flatten<V>::Flatten(const V &vector)
  : vector_(vector)
-  {
-    accumulate(vector_);
-    accumulateDim(vector_);
-  }
+{
+  accumulate(vector_);
+  accumulateDim(vector_);
+}

-  template <typename V>
-  const V & Flatten<V>::getVector(void)
-  {
-    return vector_;
-  }
+template <typename V>
+const V & Flatten<V>::getVector(void)
+{
+  return vector_;
+}

-  template <typename V>
-  const std::vector<typename Flatten<V>::Element> &
-  Flatten<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
+template <typename V>
+const std::vector<typename Flatten<V>::Element> &
+Flatten<V>::getFlatVector(void)
+{
+  return flatVector_;
+}

-  template <typename V>
-  const std::vector<size_t> & Flatten<V>::getDim(void)
-  {
-    return dim_;
-  }
+template <typename V>
+const std::vector<size_t> & Flatten<V>::getDim(void)
+{
+  return dim_;
+}

-  // Reconstruct class template implementation /////////////////////////////////
-  template <typename V>
-  void Reconstruct<V>::fill(std::vector<Element> &v)
-  {
-    for (auto &e: v)
+// Reconstruct class template implementation /////////////////////////////////
+template <typename V>
+void Reconstruct<V>::fill(std::vector<Element> &v)
+{
+  for (auto &e: v)
    {
      e = flatVector_[ind_++];
    }
-  }
+}

-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::fill(W &v)
-  {
-    for (auto &e: v)
+template <typename V>
+template <typename W>
+void Reconstruct<V>::fill(W &v)
+{
+  for (auto &e: v)
    {
      fill(e);
    }
-  }
+}

-  template <typename V>
-  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-  }
+template <typename V>
+void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
+{
+  v.resize(dim_[dim]);
+}

-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::resize(W &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-    for (auto &e: v)
+template <typename V>
+template <typename W>
+void Reconstruct<V>::resize(W &v, const unsigned int dim)
+{
+  v.resize(dim_[dim]);
+  for (auto &e: v)
    {
      resize(e, dim + 1);
    }
-  }
+}

-  template <typename V>
-  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
-                              const std::vector<size_t> &dim)
+template <typename V>
+Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
+                            const std::vector<size_t> &dim)
  : flatVector_(flatVector)
  , dim_(dim)
-  {
-    resize(vector_, 0);
-    fill(vector_);
-  }
+{
+  resize(vector_, 0);
+  fill(vector_);
+}

-  template <typename V>
-  const V & Reconstruct<V>::getVector(void)
-  {
-    return vector_;
-  }
+template <typename V>
+const V & Reconstruct<V>::getVector(void)
+{
+  return vector_;
+}

-  template <typename V>
-  const std::vector<typename Reconstruct<V>::Element> &
-  Reconstruct<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
+template <typename V>
+const std::vector<typename Reconstruct<V>::Element> &
+Reconstruct<V>::getFlatVector(void)
+{
+  return flatVector_;
+}

-  template <typename V>
-  const std::vector<size_t> & Reconstruct<V>::getDim(void)
-  {
-    return dim_;
-  }
+template <typename V>
+const std::vector<size_t> & Reconstruct<V>::getDim(void)
+{
+  return dim_;
+}

-  // Generic writer interface //////////////////////////////////////////////////
-  template <typename T>
-  inline void push(Writer<T> &w, const std::string &s) {
-    w.push(s);
-  }
+// Generic writer interface //////////////////////////////////////////////////
+template <typename T>
+inline void push(Writer<T> &w, const std::string &s) {
+  w.push(s);
+}

-  template <typename T>
-  inline void push(Writer<T> &w, const char *s)
-  {
-    w.push(std::string(s));
-  }
+template <typename T>
+inline void push(Writer<T> &w, const char *s)
+{
+  w.push(std::string(s));
+}

-  template <typename T>
-  inline void pop(Writer<T> &w)
-  {
-    w.pop();
-  }
+template <typename T>
+inline void pop(Writer<T> &w)
+{
+  w.pop();
+}

-  template <typename T, typename U>
-  inline void write(Writer<T> &w, const std::string& s, const U &output)
-  {
-    w.write(s, output);
-  }
+template <typename T, typename U>
+inline void write(Writer<T> &w, const std::string& s, const U &output)
+{
+  w.write(s, output);
+}

-  // Generic reader interface
-  template <typename T>
-  inline bool push(Reader<T> &r, const std::string &s)
-  {
-    return r.push(s);
-  }
+// Generic reader interface
+template <typename T>
+inline bool push(Reader<T> &r, const std::string &s)
+{
+  return r.push(s);
+}

-  template <typename T>
-  inline bool push(Reader<T> &r, const char *s)
-  {
-    return r.push(std::string(s));
-  }
+template <typename T>
+inline bool push(Reader<T> &r, const char *s)
+{
+  return r.push(std::string(s));
+}

-  template <typename T>
-  inline void pop(Reader<T> &r)
-  {
-    r.pop();
-  }
+template <typename T>
+inline void pop(Reader<T> &r)
+{
+  r.pop();
+}

-  template <typename T, typename U>
-  inline void read(Reader<T> &r, const std::string &s, U &output)
-  {
-    r.read(s, output);
-  }
+template <typename T, typename U>
+inline void read(Reader<T> &r, const std::string &s, U &output)
+{
+  r.read(s, output);
+}

-  // Writer template implementation ////////////////////////////////////////////
-  template <typename T>
-  Writer<T>::Writer(void)
-  {
-    upcast = static_cast<T *>(this);
-  }
+// Writer template implementation ////////////////////////////////////////////
+template <typename T>
+Writer<T>::Writer(void)
+{
+  upcast = static_cast<T *>(this);
+}

-  template <typename T>
-  void Writer<T>::push(const std::string &s)
-  {
-    upcast->push(s);
-  }
+template <typename T>
+void Writer<T>::push(const std::string &s)
+{
+  upcast->push(s);
+}

-  template <typename T>
-  void Writer<T>::pop(void)
-  {
-    upcast->pop();
-  }
+template <typename T>
+void Writer<T>::pop(void)
+{
+  upcast->pop();
+}

-  template <typename T>
-  template <typename U>
-  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-  Writer<T>::write(const std::string &s, const U &output)
-  {
-    U::write(*this, s, output);
-  }
+template <typename T>
+template <typename U>
+typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+Writer<T>::write(const std::string &s, const U &output)
+{
+  U::write(*this, s, output);
+}

-  template <typename T>
-  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-  Writer<T>::write(const std::string &s, const U &output)
-  {
-    upcast->writeDefault(s, output);
-  }
+template <typename T>
+template <typename U>
+typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+Writer<T>::write(const std::string &s, const U &output)
+{
+  upcast->writeDefault(s, output);
+}

-  // Reader template implementation
-  template <typename T>
-  Reader<T>::Reader(void)
-  {
-    upcast = static_cast<T *>(this);
-  }
+// Reader template implementation
+template <typename T>
+Reader<T>::Reader(void)
+{
+  upcast = static_cast<T *>(this);
+}

-  template <typename T>
-  bool Reader<T>::push(const std::string &s)
-  {
-    return upcast->push(s);
-  }
+template <typename T>
+bool Reader<T>::push(const std::string &s)
+{
+  return upcast->push(s);
+}

-  template <typename T>
-  void Reader<T>::pop(void)
-  {
-    upcast->pop();
-  }
+template <typename T>
+void Reader<T>::pop(void)
+{
+  upcast->pop();
+}

-  template <typename T>
-  template <typename U>
-  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-  Reader<T>::read(const std::string &s, U &output)
-  {
-    U::read(*this, s, output);
-  }
+template <typename T>
+template <typename U>
+typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+Reader<T>::read(const std::string &s, U &output)
+{
+  U::read(*this, s, output);
+}

-  template <typename T>
-  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-  Reader<T>::read(const std::string &s, U &output)
-  {
-    upcast->readDefault(s, output);
-  }
+template <typename T>
+template <typename U>
+typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+Reader<T>::read(const std::string &s, U &output)
+{
+  upcast->readDefault(s, output);
+}

-  template <typename T>
-  template <typename U>
-  void Reader<T>::fromString(U &output, const std::string &s)
-  {
-    std::istringstream is(s);
+template <typename T>
+template <typename U>
+void Reader<T>::fromString(U &output, const std::string &s)
+{
+  std::istringstream is(s);

-    is.exceptions(std::ios::failbit);
-    try
+  is.exceptions(std::ios::failbit);
+  try
    {
      is >> std::boolalpha >> output;
    }
-    catch(std::istringstream::failure &e)
+  catch(std::istringstream::failure &e)
    {
      std::cerr << "numerical conversion failure on '" << s << "' ";
      std::cerr << "(typeid: " << typeid(U).name() << ")" << std::endl;
      abort();
    }
-  }
}
+NAMESPACE_END(Grid);
+
#endif

From 69496482fcc33b38d5e8b6b4d39e313472587cf5 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:42:22 +0000
Subject: [PATCH 027/754] Format, NAMESPACE

---
 lib/serialisation/BinaryIO.h | 133 ++++++++++++++++++-----------------
 1 file changed, 67 insertions(+), 66 deletions(-)

diff --git a/lib/serialisation/BinaryIO.h b/lib/serialisation/BinaryIO.h
index 757753c7..9cc65aa9 100644
--- a/lib/serialisation/BinaryIO.h
+++ b/lib/serialisation/BinaryIO.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -24,8 +24,8 @@ Author: Peter Boyle
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
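The Writer and Reader templates in patch 026 above get their static polymorphism from the CRTP upcast pointer rather than from virtual dispatch. A minimal, self-contained sketch of that idiom follows; the names WriterBase and CoutWriter are illustrative, not Grid's:

    #include <iostream>
    #include <string>

    // The base class is parameterised on the derived type; the stored
    // upcast pointer replaces a vtable lookup.
    template <typename T>
    class WriterBase {
    public:
      WriterBase(void) : upcast(static_cast<T *>(this)) {}
      template <typename U>
      void write(const std::string &s, const U &x) { upcast->writeDefault(s, x); }
    private:
      T *upcast;
    };

    // A concrete writer only needs to supply writeDefault.
    class CoutWriter : public WriterBase<CoutWriter> {
    public:
      template <typename U>
      void writeDefault(const std::string &s, const U &x) {
        std::cout << s << " = " << x << std::endl;
      }
    };

    int main(void) {
      CoutWriter w;
      w.write("beta", 5.7); // dispatch resolved at compile time
      return 0;
    }

The same pattern lets the BinaryWriter of patch 027 below reuse the generic write front end while supplying only its own writeDefault.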
    See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
#ifndef GRID_SERIALISATION_BINARY_READER_H
#define GRID_SERIALISATION_BINARY_READER_H

@@ -37,83 +37,84 @@ Author: Peter Boyle
#include
#include

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  class BinaryWriter: public Writer<BinaryWriter>
-  {
-  public:
-    BinaryWriter(const std::string &fileName);
-    virtual ~BinaryWriter(void) = default;
-    void push(const std::string &s) {};
-    void pop(void) {};
-    template <typename U>
-    void writeDefault(const std::string &s, const U &x);
-    template <typename U>
-    void writeDefault(const std::string &s, const std::vector<U> &x);
-    void writeDefault(const std::string &s, const char *x);
-  private:
-    std::ofstream file_;
-  };
-
-  class BinaryReader: public Reader<BinaryReader>
-  {
-  public:
-    BinaryReader(const std::string &fileName);
-    virtual ~BinaryReader(void) = default;
-    bool push(const std::string &s) {return true;}
-    void pop(void) {};
-    template <typename U>
-    void readDefault(const std::string &s, U &output);
-    template <typename U>
-    void readDefault(const std::string &s, std::vector<U> &output);
-  private:
-    std::ifstream file_;
-  };
-
-  // Writer template implementation ////////////////////////////////////////////
+class BinaryWriter: public Writer<BinaryWriter>
+{
+public:
+  BinaryWriter(const std::string &fileName);
+  virtual ~BinaryWriter(void) = default;
+  void push(const std::string &s) {};
+  void pop(void) {};
  template <typename U>
-  void BinaryWriter::writeDefault(const std::string &s, const U &x)
-  {
-    file_.write((char *)&x, sizeof(U));
-  }
-
-  template <>
-  void BinaryWriter::writeDefault(const std::string &s, const std::string &x);
-
+  void writeDefault(const std::string &s, const U &x);
  template <typename U>
-  void BinaryWriter::writeDefault(const std::string &s, const std::vector<U> &x)
-  {
-    uint64_t sz = x.size();
+  void writeDefault(const std::string &s, const std::vector<U> &x);
+  void writeDefault(const std::string &s, const char *x);
+private:
+  std::ofstream file_;
+};
+
+class BinaryReader: public Reader<BinaryReader>
+{
+public:
+  BinaryReader(const std::string &fileName);
+  virtual ~BinaryReader(void) = default;
+  bool push(const std::string &s) {return true;}
+  void pop(void) {};
+  template <typename U>
+  void readDefault(const std::string &s, U &output);
+  template <typename U>
+  void readDefault(const std::string &s, std::vector<U> &output);
+private:
+  std::ifstream file_;
+};
+
+// Writer template implementation ////////////////////////////////////////////
+template <typename U>
+void BinaryWriter::writeDefault(const std::string &s, const U &x)
+{
+  file_.write((char *)&x, sizeof(U));
+}
+
+template <>
+void BinaryWriter::writeDefault(const std::string &s, const std::string &x);
+
+template <typename U>
+void BinaryWriter::writeDefault(const std::string &s, const std::vector<U> &x)
+{
+  uint64_t sz = x.size();

-    write("", sz);
-    for (uint64_t i = 0; i < sz; ++i)
+  write("", sz);
+  for (uint64_t i = 0; i < sz; ++i)
    {
      write("", x[i]);
    }
-  }
+}

-  // Reader template implementation ////////////////////////////////////////////
-  template <typename U>
-  void BinaryReader::readDefault(const std::string &s, U &output)
-  {
-    file_.read((char *)&output, sizeof(U));
-  }
+// Reader template implementation ////////////////////////////////////////////
+template <typename U>
+void BinaryReader::readDefault(const std::string &s, U &output)
+{
+  file_.read((char *)&output, sizeof(U));
+}

-  template <>
-  void BinaryReader::readDefault(const std::string &s, std::string &output);
+template <>
+void BinaryReader::readDefault(const std::string &s, std::string &output);

-  template <typename U>
-  void BinaryReader::readDefault(const std::string &s, std::vector<U> &output)
-  {
-    uint64_t sz;
+template <typename U>
+void BinaryReader::readDefault(const std::string &s, std::vector<U> &output)
+{
+  uint64_t sz;

-    read("", sz);
-    output.resize(sz);
-    for (uint64_t i = 0; i < sz; ++i)
+  read("", sz);
+  output.resize(sz);
+  for (uint64_t i = 0; i < sz; ++i)
    {
      read("", output[i]);
    }
-  }
}
+NAMESPACE_END(Grid);
+
#endif

From 59b31b6bb818878a15e6ef0ca95b53ad7cfb80f2 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:43:44 +0000
Subject: [PATCH 028/754] Format, NAMESPACE

---
 lib/serialisation/Serialisation.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/serialisation/Serialisation.h b/lib/serialisation/Serialisation.h
index 0e1c7531..b2dceb1b 100644
--- a/lib/serialisation/Serialisation.h
+++ b/lib/serialisation/Serialisation.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -24,8 +24,8 @@ Author: Peter Boyle
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
#ifndef GRID_SERIALISATION_READER_H
#define GRID_SERIALISATION_READER_H

@@ -50,8 +50,8 @@ Author: Peter Boyle
//////////////////////////////////////////
// Select the default serialiser use ifdef's
//////////////////////////////////////////
-namespace Grid {
-  typedef XmlReader DefaultReader;
-  typedef XmlWriter DefaultWriter;
-}
+NAMESPACE_BEGIN(Grid);
+typedef XmlReader DefaultReader;
+typedef XmlWriter DefaultWriter;
+NAMESPACE_END(Grid);
#endif

From 0b8a88978ba51f5beed2dfb3831f35f185415944 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 12 Jan 2018 23:47:24 +0000
Subject: [PATCH 029/754] Format, NAMESPACE

---
 lib/cartesian/Cartesian.h           |   6 +-
 lib/cartesian/Cartesian_base.h      | 449 ++++++++++++++--------------
 lib/cartesian/Cartesian_full.h      | 189 ++++++------
 lib/cartesian/Cartesian_red_black.h | 402 ++++++++++++-------------
 4 files changed, 523 insertions(+), 523 deletions(-)

diff --git a/lib/cartesian/Cartesian.h b/lib/cartesian/Cartesian.h
index f3710a48..070cad95 100644
--- a/lib/cartesian/Cartesian.h
+++ b/lib/cartesian/Cartesian.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -23,8 +23,8 @@ Author: Peter Boyle
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
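BinaryWriter and BinaryReader in patch 027 fix a bare wire format: scalar types are byte-copied, and a std::vector is a uint64_t length prefix followed by its elements, with the name strings ignored. A round-trip sketch of that format using plain standard streams (the file name demo.bin is illustrative):

    #include <cassert>
    #include <cstdint>
    #include <fstream>
    #include <vector>

    int main(void) {
      std::vector<double> in = {1.0, 2.5, -3.0}, out;
      // Write: a uint64_t size prefix, then the raw elements, as
      // writeDefault does above.
      {
        std::ofstream f("demo.bin", std::ios::binary);
        uint64_t sz = in.size();
        f.write((char *)&sz, sizeof(sz));
        f.write((char *)in.data(), sz * sizeof(double));
      }
      // Read it back using the matching length prefix.
      {
        std::ifstream f("demo.bin", std::ios::binary);
        uint64_t sz;
        f.read((char *)&sz, sizeof(sz));
        out.resize(sz);
        f.read((char *)out.data(), sz * sizeof(double));
      }
      assert(out == in);
      return 0;
    }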
    See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H

diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index acc870de..2729b2cb 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -25,265 +25,264 @@
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H

-namespace Grid{
+NAMESPACE_BEGIN(Grid);

-  //////////////////////////////////////////////////////////////////////
-  // Communicator provides information on the processor grid
-  //////////////////////////////////////////////////////////////////////
-  //   unsigned long _ndimension;
-  //   std::vector<int> _processors;     // processor grid
-  //   int              _processor;      // linear processor rank
-  //   std::vector<int> _processor_coor; // linear processor rank
-  //////////////////////////////////////////////////////////////////////
-  class GridBase : public CartesianCommunicator , public GridThread {
+//////////////////////////////////////////////////////////////////////
+// Communicator provides information on the processor grid
+//////////////////////////////////////////////////////////////////////
+//   unsigned long _ndimension;
+//   std::vector<int> _processors;     // processor grid
+//   int              _processor;      // linear processor rank
+//   std::vector<int> _processor_coor; // linear processor rank
+//////////////////////////////////////////////////////////////////////
+class GridBase : public CartesianCommunicator , public GridThread {

public:
-  int dummy;
-  // Give Lattice access
-  template<class object> friend class Lattice;
+  int dummy;
+  // Give Lattice access
+  template<class object> friend class Lattice;

-  GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
-  GridBase(const std::vector<int> & processor_grid,
-           const CartesianCommunicator &parent,
-           int &split_rank)
-    : CartesianCommunicator(processor_grid,parent,split_rank) {};
-  GridBase(const std::vector<int> & processor_grid,
-           const CartesianCommunicator &parent)
-    : CartesianCommunicator(processor_grid,parent,dummy) {};
+  GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
+  GridBase(const std::vector<int> & processor_grid,
+           const CartesianCommunicator &parent,
+           int &split_rank)
+    : CartesianCommunicator(processor_grid,parent,split_rank) {};
+  GridBase(const std::vector<int> & processor_grid,
+           const CartesianCommunicator &parent)
+    : CartesianCommunicator(processor_grid,parent,dummy) {};

-  virtual ~GridBase() = default;
+  virtual ~GridBase() = default;

-  // Physics Grid information.
-  std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
- std::vector _fdimensions;// (full) Global dimensions of array prior to cb removal - std::vector _gdimensions;// Global dimensions of array after cb removal - std::vector _ldimensions;// local dimensions of array with processor images removed - std::vector _rdimensions;// Reduced local dimensions with simd lane images and processor images removed - std::vector _ostride; // Outer stride for each dimension - std::vector _istride; // Inner stride i.e. within simd lane - int _osites; // _isites*_osites = product(dimensions). - int _isites; - int _fsites; // _isites*_osites = product(dimensions). - int _gsites; - std::vector _slice_block;// subslice information - std::vector _slice_stride; - std::vector _slice_nblock; + // Physics Grid information. + std::vector _simd_layout;// Which dimensions get relayed out over simd lanes. + std::vector _fdimensions;// (full) Global dimensions of array prior to cb removal + std::vector _gdimensions;// Global dimensions of array after cb removal + std::vector _ldimensions;// local dimensions of array with processor images removed + std::vector _rdimensions;// Reduced local dimensions with simd lane images and processor images removed + std::vector _ostride; // Outer stride for each dimension + std::vector _istride; // Inner stride i.e. within simd lane + int _osites; // _isites*_osites = product(dimensions). + int _isites; + int _fsites; // _isites*_osites = product(dimensions). + int _gsites; + std::vector _slice_block;// subslice information + std::vector _slice_stride; + std::vector _slice_nblock; - std::vector _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] - std::vector _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 + std::vector _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] + std::vector _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 public: - //////////////////////////////////////////////////////////////// - // Checkerboarding interface is virtual and overridden by - // GridCartesian / GridRedBlackCartesian - //////////////////////////////////////////////////////////////// - virtual int CheckerBoarded(int dim)=0; - virtual int CheckerBoard(const std::vector &site)=0; - virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; - virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; - virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; - virtual int CheckerBoardFromOindex (int Oindex)=0; - virtual int CheckerBoardFromOindexTable (int Oindex)=0; + //////////////////////////////////////////////////////////////// + // Checkerboarding interface is virtual and overridden by + // GridCartesian / GridRedBlackCartesian + //////////////////////////////////////////////////////////////// + virtual int CheckerBoarded(int dim)=0; + virtual int CheckerBoard(const std::vector &site)=0; + virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; + virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; + virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; + virtual int CheckerBoardFromOindex (int Oindex)=0; + virtual int CheckerBoardFromOindexTable (int Oindex)=0; - ////////////////////////////////////////////////////////////////////////////////////////////// - // Local layout calculations - ////////////////////////////////////////////////////////////////////////////////////////////// - // These 
routines are key. Subdivide the linearised cartesian index into - // "inner" index identifying which simd lane of object is associated with coord - // "outer" index identifying which element of _odata in class "Lattice" is associated with coord. - // - // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer - // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional - // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all - // lanes are operated upon simultaneously. + ////////////////////////////////////////////////////////////////////////////////////////////// + // Local layout calculations + ////////////////////////////////////////////////////////////////////////////////////////////// + // These routines are key. Subdivide the linearised cartesian index into + // "inner" index identifying which simd lane of object is associated with coord + // "outer" index identifying which element of _odata in class "Lattice" is associated with coord. + // + // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer + // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional + // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all + // lanes are operated upon simultaneously. - virtual int oIndex(std::vector &coor) - { - int idx=0; - // Works with either global or local coordinates - for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); - return idx; - } - virtual int iIndex(std::vector &lcoor) - { - int idx=0; - for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); - return idx; - } - inline int oIndexReduced(std::vector &ocoor) - { - int idx=0; - // ocoor is already reduced so can eliminate the modulo operation - // for fast indexing and inline the routine - for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; - return idx; - } - inline void oCoorFromOindex (std::vector& coor,int Oindex){ - Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); - } + virtual int oIndex(std::vector &coor) + { + int idx=0; + // Works with either global or local coordinates + for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); + return idx; + } + virtual int iIndex(std::vector &lcoor) + { + int idx=0; + for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); + return idx; + } + inline int oIndexReduced(std::vector &ocoor) + { + int idx=0; + // ocoor is already reduced so can eliminate the modulo operation + // for fast indexing and inline the routine + for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; + return idx; + } + inline void oCoorFromOindex (std::vector& coor,int Oindex){ + Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); + } - inline void InOutCoorToLocalCoor (std::vector &ocoor, std::vector &icoor, std::vector &lcoor) { - lcoor.resize(_ndimension); - for (int d = 0; d < _ndimension; d++) - lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; - } + inline void InOutCoorToLocalCoor (std::vector &ocoor, std::vector &icoor, std::vector &lcoor) { + lcoor.resize(_ndimension); + for (int d = 0; d < _ndimension; d++) + lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; + } - ////////////////////////////////////////////////////////// - // SIMD lane addressing - ////////////////////////////////////////////////////////// - inline void iCoorFromIindex(std::vector &coor,int 
lane) - { - Lexicographic::CoorFromIndex(coor,lane,_simd_layout); - } + ////////////////////////////////////////////////////////// + // SIMD lane addressing + ////////////////////////////////////////////////////////// + inline void iCoorFromIindex(std::vector &coor,int lane) + { + Lexicographic::CoorFromIndex(coor,lane,_simd_layout); + } - inline int PermuteDim(int dimension){ - return _simd_layout[dimension]>1; - } - inline int PermuteType(int dimension){ - int permute_type=0; - // - // FIXME: - // - // Best way to encode this would be to present a mask - // for which simd dimensions are rotated, and the rotation - // size. If there is only one simd dimension rotated, this is just - // a permute. - // - // Cases: PermuteType == 1,2,4,8 - // Distance should be either 0,1,2.. - // - if ( _simd_layout[dimension] > 2 ) { - for(int d=0;d<_ndimension;d++){ - if ( d != dimension ) assert ( (_simd_layout[d]==1) ); - } - permute_type = RotateBit; // How to specify distance; this is not just direction. - return permute_type; - } - - for(int d=_ndimension-1;d>dimension;d--){ - if (_simd_layout[d]>1 ) permute_type++; + inline int PermuteDim(int dimension){ + return _simd_layout[dimension]>1; + } + inline int PermuteType(int dimension){ + int permute_type=0; + // + // FIXME: + // + // Best way to encode this would be to present a mask + // for which simd dimensions are rotated, and the rotation + // size. If there is only one simd dimension rotated, this is just + // a permute. + // + // Cases: PermuteType == 1,2,4,8 + // Distance should be either 0,1,2.. + // + if ( _simd_layout[dimension] > 2 ) { + for(int d=0;d<_ndimension;d++){ + if ( d != dimension ) assert ( (_simd_layout[d]==1) ); } + permute_type = RotateBit; // How to specify distance; this is not just direction. 
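// Worked example (the layout values are illustrative): with _simd_layout =
// {1,1,2,2} in four dimensions, the counting loop below yields
// PermuteType(3) = 0, PermuteType(2) = 1 and PermuteType(1) =
// PermuteType(0) = 2; the smaller the type, the coarser the block of SIMD
// lanes exchanged by the corresponding permute.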
return permute_type; } - //////////////////////////////////////////////////////////////// - // Array sizing queries - //////////////////////////////////////////////////////////////// - inline int iSites(void) const { return _isites; }; - inline int Nsimd(void) const { return _isites; };// Synonymous with iSites - inline int oSites(void) const { return _osites; }; - inline int lSites(void) const { return _isites*_osites; }; - inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; - inline int Nd (void) const { return _ndimension;}; - - inline const std::vector LocalStarts(void) { return _lstart; }; - inline const std::vector &FullDimensions(void) { return _fdimensions;}; - inline const std::vector &GlobalDimensions(void) { return _gdimensions;}; - inline const std::vector &LocalDimensions(void) { return _ldimensions;}; - inline const std::vector &VirtualLocalDimensions(void) { return _ldimensions;}; - - //////////////////////////////////////////////////////////////// - // Utility to print the full decomposition details - //////////////////////////////////////////////////////////////// - - void show_decomposition(){ - std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl; - std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl; - std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl; - std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl; - std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; - std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl; - std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl; - std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl; - std::cout << GridLogMessage << "\toSites : " << _osites << std::endl; - std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl; - std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl; - std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl; - } - - //////////////////////////////////////////////////////////////// - // Global addressing - //////////////////////////////////////////////////////////////// - void GlobalIndexToGlobalCoor(int gidx,std::vector &gcoor){ - assert(gidx< gSites()); - Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); + for(int d=_ndimension-1;d>dimension;d--){ + if (_simd_layout[d]>1 ) permute_type++; } - void LocalIndexToLocalCoor(int lidx,std::vector &lcoor){ - assert(lidx LocalStarts(void) { return _lstart; }; + inline const std::vector &FullDimensions(void) { return _fdimensions;}; + inline const std::vector &GlobalDimensions(void) { return _gdimensions;}; + inline const std::vector &LocalDimensions(void) { return _ldimensions;}; + inline const std::vector &VirtualLocalDimensions(void) { return _ldimensions;}; + + //////////////////////////////////////////////////////////////// + // Utility to print the full decomposition details + //////////////////////////////////////////////////////////////// + + void show_decomposition(){ + std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl; + std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl; + std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl; + std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl; + std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions 
<< std::endl; + std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl; + std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl; + std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl; + std::cout << GridLogMessage << "\toSites : " << _osites << std::endl; + std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl; + std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl; + std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl; + } + + //////////////////////////////////////////////////////////////// + // Global addressing + //////////////////////////////////////////////////////////////// + void GlobalIndexToGlobalCoor(int gidx,std::vector &gcoor){ + assert(gidx< gSites()); + Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); + } + void LocalIndexToLocalCoor(int lidx,std::vector &lcoor){ + assert(lidx & gcoor,int & gidx){ + gidx=0; + int mult=1; + for(int mu=0;mu<_ndimension;mu++) { + gidx+=mult*gcoor[mu]; + mult*=_gdimensions[mu]; } - void GlobalCoorToGlobalIndex(const std::vector & gcoor,int & gidx){ - gidx=0; - int mult=1; - for(int mu=0;mu<_ndimension;mu++) { - gidx+=mult*gcoor[mu]; - mult*=_gdimensions[mu]; - } + } + void GlobalCoorToProcessorCoorLocalCoor(std::vector &pcoor,std::vector &lcoor,const std::vector &gcoor) + { + pcoor.resize(_ndimension); + lcoor.resize(_ndimension); + for(int mu=0;mu<_ndimension;mu++){ + int _fld = _fdimensions[mu]/_processors[mu]; + pcoor[mu] = gcoor[mu]/_fld; + lcoor[mu] = gcoor[mu]%_fld; } - void GlobalCoorToProcessorCoorLocalCoor(std::vector &pcoor,std::vector &lcoor,const std::vector &gcoor) - { - pcoor.resize(_ndimension); - lcoor.resize(_ndimension); - for(int mu=0;mu<_ndimension;mu++){ - int _fld = _fdimensions[mu]/_processors[mu]; - pcoor[mu] = gcoor[mu]/_fld; - lcoor[mu] = gcoor[mu]%_fld; - } - } - void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector &gcoor) - { - std::vector pcoor; - std::vector lcoor; - GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); - rank = RankFromProcessorCoor(pcoor); - /* + } + void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector &gcoor) + { + std::vector pcoor; + std::vector lcoor; + GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); + rank = RankFromProcessorCoor(pcoor); + /* std::vector cblcoor(lcoor); for(int d=0;dCheckerBoarded(d) ) { - cblcoor[d] = lcoor[d]/2; - } + if( this->CheckerBoarded(d) ) { + cblcoor[d] = lcoor[d]/2; } - */ - i_idx= iIndex(lcoor); - o_idx= oIndex(lcoor); - } + } + */ + i_idx= iIndex(lcoor); + o_idx= oIndex(lcoor); + } - void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector &gcoor) - { - gcoor.resize(_ndimension); - std::vector coor(_ndimension); + void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector &gcoor) + { + gcoor.resize(_ndimension); + std::vector coor(_ndimension); - ProcessorCoorFromRank(rank,coor); - for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; + ProcessorCoorFromRank(rank,coor); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; - iCoorFromIindex(coor,i_idx); - for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu]; + iCoorFromIindex(coor,i_idx); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu]; - oCoorFromOindex (coor,o_idx); - for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; + oCoorFromOindex (coor,o_idx); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; + } + 
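// Worked example of the lexicographic convention used here (the values are
// illustrative): with _gdimensions = {4,4,4,8}, GlobalCoorToGlobalIndex
// above maps gcoor = {1,2,3,5} to gidx = 1 + 4*(2 + 4*(3 + 4*5)) = 377,
// and Lexicographic::CoorFromIndex recovers the coordinate by repeated
// division and modulo against the same dimension list.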
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector &fcoor) + { + RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); + if(CheckerBoarded(0)){ + fcoor[0] = fcoor[0]*2+cb; } - void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector &fcoor) - { - RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); - if(CheckerBoarded(0)){ - fcoor[0] = fcoor[0]*2+cb; - } - } - void ProcessorCoorLocalCoorToGlobalCoor(std::vector &Pcoor,std::vector &Lcoor,std::vector &gcoor) - { - gcoor.resize(_ndimension); - for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; - } + } + void ProcessorCoorLocalCoorToGlobalCoor(std::vector &Pcoor,std::vector &Lcoor,std::vector &gcoor) + { + gcoor.resize(_ndimension); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; + } }; - -} +NAMESPACE_END(Grid); #endif diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index 9273abf3..c3e5e5e7 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,12 +23,12 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_CARTESIAN_FULL_H #define GRID_CARTESIAN_FULL_H -namespace Grid{ +NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////////////// // Grid Support. @@ -38,81 +38,81 @@ namespace Grid{ class GridCartesian: public GridBase { public: - int dummy; - virtual int CheckerBoardFromOindexTable (int Oindex) { - return 0; - } - virtual int CheckerBoardFromOindex (int Oindex) - { - return 0; - } - virtual int CheckerBoarded(int dim){ - return 0; - } - virtual int CheckerBoard(const std::vector &site){ - return 0; - } - virtual int CheckerBoardDestination(int cb,int shift,int dim){ - return 0; - } - virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ - return shift; - } - virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ - return shift; - } - ///////////////////////////////////////////////////////////////////////// - // Constructor takes a parent grid and possibly subdivides communicator. 
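// A hedged construction sketch (the sizes are illustrative): a 16^4 global
// lattice with SIMD layout {1,1,2,2} and a 2x1x1x2 MPI decomposition is
// declared as
//   std::vector<int> latt({16,16,16,16}), simd({1,1,2,2}), mpi({2,1,1,2});
//   GridCartesian grid(latt, simd, mpi);
// the parent-grid constructors declared just below then allow a second
// GridCartesian to subdivide grid's communicator.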
- ///////////////////////////////////////////////////////////////////////// - GridCartesian(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) - { - Init(dimensions,simd_layout,processor_grid); - } - GridCartesian(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) - { - Init(dimensions,simd_layout,processor_grid); - } - ///////////////////////////////////////////////////////////////////////// - // Construct from comm world - ///////////////////////////////////////////////////////////////////////// - GridCartesian(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid) : GridBase(processor_grid) - { - Init(dimensions,simd_layout,processor_grid); - } + int dummy; + virtual int CheckerBoardFromOindexTable (int Oindex) { + return 0; + } + virtual int CheckerBoardFromOindex (int Oindex) + { + return 0; + } + virtual int CheckerBoarded(int dim){ + return 0; + } + virtual int CheckerBoard(const std::vector &site){ + return 0; + } + virtual int CheckerBoardDestination(int cb,int shift,int dim){ + return 0; + } + virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ + return shift; + } + virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ + return shift; + } + ///////////////////////////////////////////////////////////////////////// + // Constructor takes a parent grid and possibly subdivides communicator. + ///////////////////////////////////////////////////////////////////////// + GridCartesian(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) + { + Init(dimensions,simd_layout,processor_grid); + } + GridCartesian(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) + { + Init(dimensions,simd_layout,processor_grid); + } + ///////////////////////////////////////////////////////////////////////// + // Construct from comm world + ///////////////////////////////////////////////////////////////////////// + GridCartesian(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid) : GridBase(processor_grid) + { + Init(dimensions,simd_layout,processor_grid); + } - virtual ~GridCartesian() = default; + virtual ~GridCartesian() = default; - void Init(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid) - { - /////////////////////// - // Grid information - /////////////////////// - _ndimension = dimensions.size(); + void Init(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid) + { + /////////////////////// + // Grid information + /////////////////////// + _ndimension = dimensions.size(); - _fdimensions.resize(_ndimension); - _gdimensions.resize(_ndimension); - _ldimensions.resize(_ndimension); - _rdimensions.resize(_ndimension); - _simd_layout.resize(_ndimension); - _lstart.resize(_ndimension); - _lend.resize(_ndimension); + _fdimensions.resize(_ndimension); + _gdimensions.resize(_ndimension); + _ldimensions.resize(_ndimension); + 
_rdimensions.resize(_ndimension); + _simd_layout.resize(_ndimension); + _lstart.resize(_ndimension); + _lend.resize(_ndimension); - _ostride.resize(_ndimension); - _istride.resize(_ndimension); + _ostride.resize(_ndimension); + _istride.resize(_ndimension); - _fsites = _gsites = _osites = _isites = 1; + _fsites = _gsites = _osites = _isites = 1; - for (int d = 0; d < _ndimension; d++) + for (int d = 0; d < _ndimension; d++) { _fdimensions[d] = dimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions @@ -134,30 +134,30 @@ public: // Addressing support if (d == 0) - { - _ostride[d] = 1; - _istride[d] = 1; - } + { + _ostride[d] = 1; + _istride[d] = 1; + } else - { - _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; - _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; - } + { + _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; + _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; + } } - /////////////////////// - // subplane information - /////////////////////// - _slice_block.resize(_ndimension); - _slice_stride.resize(_ndimension); - _slice_nblock.resize(_ndimension); + /////////////////////// + // subplane information + /////////////////////// + _slice_block.resize(_ndimension); + _slice_stride.resize(_ndimension); + _slice_nblock.resize(_ndimension); - int block = 1; - int nblock = 1; - for (int d = 0; d < _ndimension; d++) - nblock *= _rdimensions[d]; + int block = 1; + int nblock = 1; + for (int d = 0; d < _ndimension; d++) + nblock *= _rdimensions[d]; - for (int d = 0; d < _ndimension; d++) + for (int d = 0; d < _ndimension; d++) { nblock /= _rdimensions[d]; _slice_block[d] = block; @@ -165,7 +165,8 @@ public: _slice_nblock[d] = nblock; block = block * _rdimensions[d]; } - }; + }; }; -} + +NAMESPACE_END(Grid); #endif diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index ee424385..8496042c 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,179 +24,179 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_CARTESIAN_RED_BLACK_H #define GRID_CARTESIAN_RED_BLACK_H -namespace Grid { +NAMESPACE_BEGIN(Grid); - static const int CbRed =0; - static const int CbBlack=1; - static const int Even =CbRed; - static const int Odd =CbBlack; +static const int CbRed =0; +static const int CbBlack=1; +static const int Even =CbRed; +static const int Odd =CbBlack; // Specialise this for red black grids storing half the data like a chess board. 
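The class below assigns every site a colour by summing its coordinates over the checkerboarded dimensions and keeping the low bit. A self-contained sketch of that rule (the dimension count and mask are illustrative):

    #include <cassert>
    #include <vector>

    // Parity of a site: sum the coordinates of the masked dimensions and
    // keep the low bit, mirroring the CheckerBoard() rule below.
    static int parity(const std::vector<int> &site, const std::vector<int> &mask) {
      int linear = 0;
      for (size_t d = 0; d < site.size(); d++)
        if (mask[d]) linear += site[d];
      return linear & 0x1;
    }

    int main(void) {
      std::vector<int> mask({1, 1, 1, 1});     // checkerboard all four dimensions
      assert(parity({0, 0, 0, 0}, mask) == 0); // an even ("red") site
      assert(parity({1, 0, 0, 0}, mask) == 1); // its neighbour is odd ("black")
      return 0;
    }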
class GridRedBlackCartesian : public GridBase { public: - std::vector _checker_dim_mask; - int _checker_dim; - std::vector _checker_board; + std::vector _checker_dim_mask; + int _checker_dim; + std::vector _checker_board; - virtual int CheckerBoarded(int dim){ - if( dim==_checker_dim) return 1; - else return 0; - } - virtual int CheckerBoard(const std::vector &site){ - int linear=0; - assert(site.size()==_ndimension); - for(int d=0;d<_ndimension;d++){ - if(_checker_dim_mask[d]) - linear=linear+site[d]; - } - return (linear&0x1); + virtual int CheckerBoarded(int dim){ + if( dim==_checker_dim) return 1; + else return 0; + } + virtual int CheckerBoard(const std::vector &site){ + int linear=0; + assert(site.size()==_ndimension); + for(int d=0;d<_ndimension;d++){ + if(_checker_dim_mask[d]) + linear=linear+site[d]; } + return (linear&0x1); + } - // Depending on the cb of site, we toggle source cb. - // for block #b, element #e = (b, e) - // we need - virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){ - if(dim != _checker_dim) return shift; + // Depending on the cb of site, we toggle source cb. + // for block #b, element #e = (b, e) + // we need + virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){ + if(dim != _checker_dim) return shift; - int fulldim =_fdimensions[dim]; - shift = (shift+fulldim)%fulldim; + int fulldim =_fdimensions[dim]; + shift = (shift+fulldim)%fulldim; - // Probably faster with table lookup; - // or by looping over x,y,z and multiply rather than computing checkerboard. + // Probably faster with table lookup; + // or by looping over x,y,z and multiply rather than computing checkerboard. - if ( (source_cb+ocb)&1 ) { - return (shift)/2; - } else { - return (shift+1)/2; - } + if ( (source_cb+ocb)&1 ) { + return (shift)/2; + } else { + return (shift+1)/2; } - virtual int CheckerBoardFromOindexTable (int Oindex) { - return _checker_board[Oindex]; - } - virtual int CheckerBoardFromOindex (int Oindex) - { - std::vector ocoor; - oCoorFromOindex(ocoor,Oindex); - return CheckerBoard(ocoor); - } - virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ + } + virtual int CheckerBoardFromOindexTable (int Oindex) { + return _checker_board[Oindex]; + } + virtual int CheckerBoardFromOindex (int Oindex) + { + std::vector ocoor; + oCoorFromOindex(ocoor,Oindex); + return CheckerBoard(ocoor); + } + virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ - if(dim != _checker_dim) return shift; + if(dim != _checker_dim) return shift; - int ocb=CheckerBoardFromOindex(osite); + int ocb=CheckerBoardFromOindex(osite); - return CheckerBoardShiftForCB(source_cb,dim,shift,ocb); - } + return CheckerBoardShiftForCB(source_cb,dim,shift,ocb); + } - virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ - if ( _checker_dim_mask[dim] ) { - // If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims - // does NOT cause a parity hop. - int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim]; - if ( (shift+add) &0x1) { - return 1-source_cb; - } else { - return source_cb; - } + virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ + if ( _checker_dim_mask[dim] ) { + // If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims + // does NOT cause a parity hop. + int add=(dim==_checker_dim) ? 
0 : _fdimensions[_checker_dim]; + if ( (shift+add) &0x1) { + return 1-source_cb; } else { return source_cb; - } - }; + } else { + return source_cb; - //////////////////////////////////////////////////////////// - // Create Redblack from original grid; require full grid pointer ? - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) - { - int dims = base->_ndimension; - std::vector checker_dim_mask(dims,1); - int checker_dim = 0; - Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); - }; - - //////////////////////////////////////////////////////////// - // Create redblack from original grid, with non-trivial checker dim mask - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base, - const std::vector &checker_dim_mask, - int checker_dim - ) : GridBase(base->_processors,*base) - { - Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ; } + }; - virtual ~GridRedBlackCartesian() = default; + //////////////////////////////////////////////////////////// + // Create Redblack from original grid; require full grid pointer ? + //////////////////////////////////////////////////////////// + GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) + { + int dims = base->_ndimension; + std::vector checker_dim_mask(dims,1); + int checker_dim = 0; + Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); + }; + + //////////////////////////////////////////////////////////// + // Create redblack from original grid, with non-trivial checker dim mask + //////////////////////////////////////////////////////////// + GridRedBlackCartesian(const GridBase *base, + const std::vector &checker_dim_mask, + int checker_dim + ) : GridBase(base->_processors,*base) + { + Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ; + } + + virtual ~GridRedBlackCartesian() = default; #if 0 - //////////////////////////////////////////////////////////// - // Create redblack grid ;; deprecate these. Should not - // need direct creation of redblack without a full grid to base on - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base, - const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const std::vector &checker_dim_mask, - int checker_dim - ) : GridBase(processor_grid,*base) - { - Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); - } + //////////////////////////////////////////////////////////// + // Create redblack grid ;; deprecate these. 
Should not + // need direct creation of redblack without a full grid to base on + //////////////////////////////////////////////////////////// + GridRedBlackCartesian(const GridBase *base, + const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const std::vector &checker_dim_mask, + int checker_dim + ) : GridBase(processor_grid,*base) + { + Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); + } - //////////////////////////////////////////////////////////// - // Create redblack grid - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base, - const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid) : GridBase(processor_grid,*base) - { - std::vector checker_dim_mask(dimensions.size(),1); - int checker_dim = 0; - Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); - } + //////////////////////////////////////////////////////////// + // Create redblack grid + //////////////////////////////////////////////////////////// + GridRedBlackCartesian(const GridBase *base, + const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid) : GridBase(processor_grid,*base) + { + std::vector checker_dim_mask(dimensions.size(),1); + int checker_dim = 0; + Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); + } #endif - void Init(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const std::vector &checker_dim_mask, - int checker_dim) - { - /////////////////////// - // Grid information - /////////////////////// - _checker_dim = checker_dim; - assert(checker_dim_mask[checker_dim] == 1); - _ndimension = dimensions.size(); - assert(checker_dim_mask.size() == _ndimension); - assert(processor_grid.size() == _ndimension); - assert(simd_layout.size() == _ndimension); + void Init(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const std::vector &checker_dim_mask, + int checker_dim) + { + /////////////////////// + // Grid information + /////////////////////// + _checker_dim = checker_dim; + assert(checker_dim_mask[checker_dim] == 1); + _ndimension = dimensions.size(); + assert(checker_dim_mask.size() == _ndimension); + assert(processor_grid.size() == _ndimension); + assert(simd_layout.size() == _ndimension); - _fdimensions.resize(_ndimension); - _gdimensions.resize(_ndimension); - _ldimensions.resize(_ndimension); - _rdimensions.resize(_ndimension); - _simd_layout.resize(_ndimension); - _lstart.resize(_ndimension); - _lend.resize(_ndimension); + _fdimensions.resize(_ndimension); + _gdimensions.resize(_ndimension); + _ldimensions.resize(_ndimension); + _rdimensions.resize(_ndimension); + _simd_layout.resize(_ndimension); + _lstart.resize(_ndimension); + _lend.resize(_ndimension); - _ostride.resize(_ndimension); - _istride.resize(_ndimension); + _ostride.resize(_ndimension); + _istride.resize(_ndimension); - _fsites = _gsites = _osites = _isites = 1; + _fsites = _gsites = _osites = _isites = 1; - _checker_dim_mask = checker_dim_mask; + _checker_dim_mask = checker_dim_mask; - for (int d = 0; d < _ndimension; d++) + for (int d = 0; d < _ndimension; d++) { _fdimensions[d] = dimensions[d]; _gdimensions[d] = _fdimensions[d]; @@ -204,11 +204,11 @@ public: _gsites = _gsites * _gdimensions[d]; if (d == _checker_dim) - { - assert((_gdimensions[d] & 0x1) == 0); - 
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard - _gsites /= 2; - } + { + assert((_gdimensions[d] & 0x1) == 0); + _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard + _gsites /= 2; + } _ldimensions[d] = _gdimensions[d] / _processors[d]; assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); _lstart[d] = _processor_coor[d] * _ldimensions[d]; @@ -223,42 +223,42 @@ public: // all elements of a simd vector must have same checkerboard. // If Ls vectorised, this must still be the case; e.g. dwf rb5d if (_simd_layout[d] > 1) - { - if (checker_dim_mask[d]) - { - assert((_rdimensions[d] & 0x1) == 0); - } - } + { + if (checker_dim_mask[d]) + { + assert((_rdimensions[d] & 0x1) == 0); + } + } _osites *= _rdimensions[d]; _isites *= _simd_layout[d]; // Addressing support if (d == 0) - { - _ostride[d] = 1; - _istride[d] = 1; - } + { + _ostride[d] = 1; + _istride[d] = 1; + } else - { - _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; - _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; - } + { + _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; + _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; + } } - //////////////////////////////////////////////////////////////////////////////////////////// - // subplane information - //////////////////////////////////////////////////////////////////////////////////////////// - _slice_block.resize(_ndimension); - _slice_stride.resize(_ndimension); - _slice_nblock.resize(_ndimension); + //////////////////////////////////////////////////////////////////////////////////////////// + // subplane information + //////////////////////////////////////////////////////////////////////////////////////////// + _slice_block.resize(_ndimension); + _slice_stride.resize(_ndimension); + _slice_nblock.resize(_ndimension); - int block = 1; - int nblock = 1; - for (int d = 0; d < _ndimension; d++) - nblock *= _rdimensions[d]; + int block = 1; + int nblock = 1; + for (int d = 0; d < _ndimension; d++) + nblock *= _rdimensions[d]; - for (int d = 0; d < _ndimension; d++) + for (int d = 0; d < _ndimension; d++) { nblock /= _rdimensions[d]; _slice_block[d] = block; @@ -267,55 +267,55 @@ public: block = block * _rdimensions[d]; } - //////////////////////////////////////////////// - // Create a checkerboard lookup table - //////////////////////////////////////////////// - int rvol = 1; - for (int d = 0; d < _ndimension; d++) + //////////////////////////////////////////////// + // Create a checkerboard lookup table + //////////////////////////////////////////////// + int rvol = 1; + for (int d = 0; d < _ndimension; d++) { rvol = rvol * _rdimensions[d]; } - _checker_board.resize(rvol); - for (int osite = 0; osite < _osites; osite++) + _checker_board.resize(rvol); + for (int osite = 0; osite < _osites; osite++) { _checker_board[osite] = CheckerBoardFromOindex(osite); } - }; + }; - protected: - virtual int oIndex(std::vector &coor) - { - int idx = 0; - for (int d = 0; d < _ndimension; d++) +protected: + virtual int oIndex(std::vector &coor) + { + int idx = 0; + for (int d = 0; d < _ndimension; d++) { if (d == _checker_dim) - { - idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); - } + { + idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); + } else - { - idx += _ostride[d] * (coor[d] % _rdimensions[d]); - } + { + idx += _ostride[d] * (coor[d] % _rdimensions[d]); + } } - return idx; - }; + return idx; + }; - virtual int iIndex(std::vector &lcoor) - { - int idx = 0; - for (int d = 0; d < _ndimension; d++) + virtual int iIndex(std::vector &lcoor) + 
{ + int idx = 0; + for (int d = 0; d < _ndimension; d++) { if (d == _checker_dim) - { - idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); - } + { + idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); + } else - { - idx += _istride[d] * (lcoor[d] / _rdimensions[d]); - } + { + idx += _istride[d] * (lcoor[d] / _rdimensions[d]); + } } - return idx; - } + return idx; + } }; -} +NAMESPACE_END(Grid); #endif From 1056e36f11fcbe1de8fd679b3305109086f11264 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:49:46 +0000 Subject: [PATCH 030/754] Format, NAMESPACE --- lib/cartesian/Cartesian_base.h | 1 - lib/cartesian/Cartesian_full.h | 1 - lib/cartesian/Cartesian_red_black.h | 31 ----------------------------- 3 files changed, 33 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 2729b2cb..392ed6cd 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -30,7 +30,6 @@ #ifndef GRID_CARTESIAN_BASE_H #define GRID_CARTESIAN_BASE_H - NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////// diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index c3e5e5e7..3b46368f 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -34,7 +34,6 @@ NAMESPACE_BEGIN(Grid); // Grid Support. ///////////////////////////////////////////////////////////////////////////////////////// - class GridCartesian: public GridBase { public: diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index 8496042c..6313fdab 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -29,7 +29,6 @@ Author: Peter Boyle #ifndef GRID_CARTESIAN_RED_BLACK_H #define GRID_CARTESIAN_RED_BLACK_H - NAMESPACE_BEGIN(Grid); static const int CbRed =0; @@ -59,7 +58,6 @@ public: return (linear&0x1); } - // Depending on the cb of site, we toggle source cb. // for block #b, element #e = (b, e) // we need @@ -135,35 +133,6 @@ public: } virtual ~GridRedBlackCartesian() = default; -#if 0 - //////////////////////////////////////////////////////////// - // Create redblack grid ;; deprecate these. 
Should not - // need direct creation of redblack without a full grid to base on - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base, - const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const std::vector &checker_dim_mask, - int checker_dim - ) : GridBase(processor_grid,*base) - { - Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); - } - - //////////////////////////////////////////////////////////// - // Create redblack grid - //////////////////////////////////////////////////////////// - GridRedBlackCartesian(const GridBase *base, - const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid) : GridBase(processor_grid,*base) - { - std::vector checker_dim_mask(dimensions.size(),1); - int checker_dim = 0; - Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim); - } -#endif void Init(const std::vector &dimensions, const std::vector &simd_layout, From 7e70f4ed9caab258160ae052e7d60bb479639021 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:55:03 +0000 Subject: [PATCH 031/754] Format, NAMESPACE --- lib/perfmon/PerfCount.cc | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/lib/perfmon/PerfCount.cc b/lib/perfmon/PerfCount.cc index c6f92b9f..2062bb59 100644 --- a/lib/perfmon/PerfCount.cc +++ b/lib/perfmon/PerfCount.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,13 +23,13 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define RawConfig(A,B) (A<<8|B) @@ -39,16 +39,16 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." 
, CPUCYCLES }, - // 4 + // 4 #ifdef KNL - { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES }, - { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS }, - { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS }, - { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS }, - { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS }, - { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS }, - { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS }, - // 11 + { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES }, + { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS }, + { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS }, + { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS }, + { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS }, + { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS }, + { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS }, + // 11 #else { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS}, @@ -57,19 +57,20 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS}, - // 11 + // 11 #endif { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS}, - //15 + //15 { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS}, { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS} - //19 + //19 // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" }, #endif }; -} +NAMESPACE_END(Grid); + From c01a1e02fe36405e3bf9da55dd510236e961de87 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:55:38 +0000 Subject: [PATCH 032/754] Namespace, format --- lib/perfmon/PerfCount.h | 23 +++++---- lib/perfmon/Stat.cc | 67 ++++++++++++------------ lib/perfmon/Stat.h | 111 ++++++++++++++++++++-------------------- lib/perfmon/Timer.h | 15 +++--- 4 files changed, 109 insertions(+), 107 deletions(-) diff --git a/lib/perfmon/PerfCount.h b/lib/perfmon/PerfCount.h index 73d2c70f..8e27f23f 100644 --- a/lib/perfmon/PerfCount.h +++ b/lib/perfmon/PerfCount.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,8 +25,8 @@ Author: paboyle 51 Franklin 
Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H @@ -47,7 +47,7 @@ Author: paboyle #include #endif -namespace Grid { +NAMESPACE_BEGIN(Grid); #ifdef __linux__ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, @@ -84,9 +84,9 @@ inline uint64_t cyclecount(void){ #ifdef __bgq__ inline uint64_t cyclecount(void){ - uint64_t tmp; - asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) ); - return tmp; + uint64_t tmp; + asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) ); + return tmp; } #elif defined __x86_64__ inline uint64_t cyclecount(void){ @@ -97,7 +97,7 @@ inline uint64_t cyclecount(void){ #else inline uint64_t cyclecount(void){ - return 0; + return 0; } #endif @@ -225,8 +225,8 @@ public: int N = PerformanceCounterConfigs[PCT].normalisation; const char * sn = PerformanceCounterConfigs[N].name ; const char * sc = PerformanceCounterConfigs[PCT].name; - std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, - sc, count, sc,sn, (double)count/(double)cycles); + std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, + sc, count, sc,sn, (double)count/(double)cycles); #else std::printf("%llu cycles \n", elapsed ); #endif @@ -241,5 +241,6 @@ public: }; -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/perfmon/Stat.cc b/lib/perfmon/Stat.cc index 3f47fd83..4c3be254 100644 --- a/lib/perfmon/Stat.cc +++ b/lib/perfmon/Stat.cc @@ -2,7 +2,7 @@ #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); bool PmuStat::pmu_initialized=false; @@ -175,39 +175,39 @@ void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask) } - void PmuStat::KNLsetup(void){ +void PmuStat::KNLsetup(void){ - int ret; - char fname[1024]; + int ret; + char fname[1024]; - // MC RPQ inserts and WPQ inserts (reads & writes) - for (int mc = 0; mc < NMC; ++mc) - { - ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc); - // RPQ Inserts - KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1); - // WPQ Inserts - KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1); - } - // EDC RPQ inserts and WPQ inserts - for (int edc=0; edc < NEDC; ++edc) - { - ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc); - // RPQ inserts - KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1); - // WPQ inserts - KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1); - } - // EDC HitE, HitM, MissE, MissM - for (int edc=0; edc < NEDC; ++edc) - { - ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc); - KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1); - KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2); - KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4); - KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8); - } - } + // MC RPQ inserts and WPQ inserts (reads & writes) + for (int mc = 0; mc < NMC; ++mc) + { + ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc); + // RPQ Inserts + KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1); + // WPQ Inserts + KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1); + } + // EDC RPQ inserts and WPQ inserts + for (int edc=0; edc < NEDC; ++edc) + { + ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc); + // RPQ inserts + KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1); + // WPQ 
inserts + KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1); + } + // EDC HitE, HitM, MissE, MissM + for (int edc=0; edc < NEDC; ++edc) + { + ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc); + KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1); + KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2); + KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4); + KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8); + } +} uint64_t PmuStat::KNLreadctr(int fd) { @@ -242,4 +242,5 @@ void PmuStat::KNLreadctrs(ctrs &c) } #endif -} +NAMESPACE_END(Grid); + diff --git a/lib/perfmon/Stat.h b/lib/perfmon/Stat.h index 96bd594a..30baec29 100644 --- a/lib/perfmon/Stat.h +++ b/lib/perfmon/Stat.h @@ -5,7 +5,7 @@ #define _KNIGHTS_LANDING_ROOTONLY #endif -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////// // Extra KNL counters from MCDRAM @@ -15,14 +15,14 @@ namespace Grid { #define NEDC 8 struct ctrs { - uint64_t mcrd[NMC]; - uint64_t mcwr[NMC]; - uint64_t edcrd[NEDC]; - uint64_t edcwr[NEDC]; - uint64_t edchite[NEDC]; - uint64_t edchitm[NEDC]; - uint64_t edcmisse[NEDC]; - uint64_t edcmissm[NEDC]; + uint64_t mcrd[NMC]; + uint64_t mcwr[NMC]; + uint64_t edcrd[NEDC]; + uint64_t edcwr[NEDC]; + uint64_t edchite[NEDC]; + uint64_t edchitm[NEDC]; + uint64_t edcmisse[NEDC]; + uint64_t edcmissm[NEDC]; }; // Peter/Azusa: // Our modification of a code provided by Larry Meadows from Intel @@ -44,61 +44,62 @@ struct knl_gbl_ class PmuStat { - uint64_t counters[8][256]; + uint64_t counters[8][256]; #ifdef _KNIGHTS_LANDING_ - static struct knl_gbl_ gbl; + static struct knl_gbl_ gbl; #endif - const char *name; + const char *name; - uint64_t reads; // memory reads - uint64_t writes; // memory writes - uint64_t mrstart; // memory read counter at start of parallel region - uint64_t mrend; // memory read counter at end of parallel region - uint64_t mwstart; // memory write counter at start of parallel region - uint64_t mwend; // memory write counter at end of parallel region + uint64_t reads; // memory reads + uint64_t writes; // memory writes + uint64_t mrstart; // memory read counter at start of parallel region + uint64_t mrend; // memory read counter at end of parallel region + uint64_t mwstart; // memory write counter at start of parallel region + uint64_t mwend; // memory write counter at end of parallel region - // cumulative counters - uint64_t count; // number of invocations - uint64_t tregion; // total time in parallel region (from thread 0) - uint64_t tcycles; // total cycles inside parallel region - uint64_t inst, ref, cyc; // fixed counters - uint64_t pmc0, pmc1;// pmu - // add memory counters here - // temp variables - uint64_t tstart; // tsc at start of parallel region - uint64_t tend; // tsc at end of parallel region - // map for ctrs values - // 0 pmc0 start - // 1 pmc0 end - // 2 pmc1 start - // 3 pmc1 end - // 4 tsc start - // 5 tsc end - static bool pmu_initialized; + // cumulative counters + uint64_t count; // number of invocations + uint64_t tregion; // total time in parallel region (from thread 0) + uint64_t tcycles; // total cycles inside parallel region + uint64_t inst, ref, cyc; // fixed counters + uint64_t pmc0, pmc1;// pmu + // add memory counters here + // temp variables + uint64_t tstart; // tsc at start of parallel region + uint64_t tend; // tsc at end of parallel region + // map for ctrs values + // 0 pmc0 start + // 1 pmc0 end + // 2 pmc1 start + // 3 pmc1 end + // 4 tsc start + // 5 tsc end + static bool pmu_initialized; public: - 
static bool is_init(void){ return pmu_initialized;} - static void pmu_init(void); - static void pmu_fini(void); - static void pmu_start(void); - static void pmu_stop(void); - void accum(int nthreads); - static void xmemctrs(uint64_t *mr, uint64_t *mw); - void start(void); - void enter(int t); - void exit(int t); - void print(void); - void init(const char *regname); - void clear(void); + static bool is_init(void){ return pmu_initialized;} + static void pmu_init(void); + static void pmu_fini(void); + static void pmu_start(void); + static void pmu_stop(void); + void accum(int nthreads); + static void xmemctrs(uint64_t *mr, uint64_t *mw); + void start(void); + void enter(int t); + void exit(int t); + void print(void); + void init(const char *regname); + void clear(void); #ifdef _KNIGHTS_LANDING_ - static void KNLsetup(void); - static uint64_t KNLreadctr(int fd); - static void KNLreadctrs(ctrs &c); - static void KNLevsetup(const char *ename, int &fd, int event, int umask); + static void KNLsetup(void); + static uint64_t KNLreadctr(int fd); + static void KNLreadctrs(ctrs &c); + static void KNLevsetup(const char *ename, int &fd, int event, int umask); #endif - }; +}; + +NAMESPACE_END(Grid); -} #endif diff --git a/lib/perfmon/Timer.h b/lib/perfmon/Timer.h index 392ccc1d..5b23fcd4 100644 --- a/lib/perfmon/Timer.h +++ b/lib/perfmon/Timer.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,8 +24,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_TIME_H #define GRID_TIME_H @@ -33,11 +33,9 @@ Author: Peter Boyle #include #include -namespace Grid { - - - // Dress the output; use std::chrono +NAMESPACE_BEGIN(Grid) +// Dress the output; use std::chrono // C++11 time facilities better? inline double usecond(void) { struct timeval tv; @@ -98,5 +96,6 @@ public: } }; -} +NAMESPACE_END(Grid) + #endif From 81cc28f6ca42274b974ae53da9496cc2c2e111d0 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:57:22 +0000 Subject: [PATCH 033/754] Format --- lib/log/Log.h | 89 +++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/lib/log/Log.h b/lib/log/Log.h index ddff4c1d..6fd09124 100644 --- a/lib/log/Log.h +++ b/lib/log/Log.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,8 +25,8 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include @@ -37,13 +37,12 @@ #include #endif -namespace Grid { +NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////////////// // Dress the output; use std::chrono for time stamping via the StopWatch class ////////////////////////////////////////////////////////////////////////////////////////////////// - class Colours{ protected: bool is_active; @@ -57,15 +56,15 @@ public: void Active(bool activate){ is_active=activate; if (is_active){ - colour["BLACK"] ="\033[30m"; - colour["RED"] ="\033[31m"; - colour["GREEN"] ="\033[32m"; - colour["YELLOW"] ="\033[33m"; - colour["BLUE"] ="\033[34m"; - colour["PURPLE"] ="\033[35m"; - colour["CYAN"] ="\033[36m"; - colour["WHITE"] ="\033[37m"; - colour["NORMAL"] ="\033[0;39m"; + colour["BLACK"] ="\033[30m"; + colour["RED"] ="\033[31m"; + colour["GREEN"] ="\033[32m"; + colour["YELLOW"] ="\033[33m"; + colour["BLUE"] ="\033[34m"; + colour["PURPLE"] ="\033[35m"; + colour["CYAN"] ="\033[36m"; + colour["WHITE"] ="\033[37m"; + colour["NORMAL"] ="\033[0;39m"; } else { colour["BLACK"] =""; colour["RED"] =""; @@ -101,14 +100,14 @@ public: std::string colour() {return Painter.colour[COLOUR];} Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on), - name(nm), - topName(topNm), - Painter(col_class), - timing_mode(0), - COLOUR(col) - { - StopWatch = & GlobalStopWatch; - }; + name(nm), + topName(topNm), + Painter(col_class), + timing_mode(0), + COLOUR(col) + { + StopWatch = & GlobalStopWatch; + }; void Active(int on) {active = on;}; int isActive(void) {return active;}; @@ -149,7 +148,7 @@ public: class GridLogger: public Logger { public: GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"): - Logger("Grid", on, nm, col_class, col_key){}; + Logger("Grid", on, nm, col_class, col_key){}; }; void GridLogConfigure(std::vector &logstreams); @@ -165,39 +164,39 @@ extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; extern Colours GridLogColours; - std::string demangle(const char* name) ; +std::string demangle(const char* name) ; #define _NBACKTRACE (256) extern void * Grid_backtrace_buffer[_NBACKTRACE]; -#define BACKTRACEFILE() {\ -char string[20]; \ -std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \ -std::FILE * fp = std::fopen(string,"w"); \ -BACKTRACEFP(fp)\ -std::fclose(fp); \ -} +#define BACKTRACEFILE() { \ + char string[20]; \ + std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \ + std::FILE * fp = std::fopen(string,"w"); \ + BACKTRACEFP(fp) \ + std::fclose(fp); \ + } #ifdef HAVE_EXECINFO_H -#define BACKTRACEFP(fp) { \ -int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\ -char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\ -for (int i = 0; i < symbols; i++){\ - std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \ -}\ -} +#define BACKTRACEFP(fp) { \ + int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); \ + char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); \ + for (int i = 0; i < symbols; i++){ \ + std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); 
std::fflush(fp); \ + } \ + } #else -#define BACKTRACEFP(fp) { \ -std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \ -std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \ -std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \ -std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \ -} +#define BACKTRACEFP(fp) { \ + std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \ + std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \ + std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \ + std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \ + } #endif #define BACKTRACE() BACKTRACEFP(stdout) +NAMESPACE_END(Grid); -} #endif From 18daf850696a9b362cf108e53b153c36aec0443e Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:58:23 +0000 Subject: [PATCH 034/754] Emacs format --- lib/log/Log.cc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/log/Log.cc b/lib/log/Log.cc index bc46893f..de4b8b51 100644 --- a/lib/log/Log.cc +++ b/lib/log/Log.cc @@ -28,27 +28,27 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #include #include #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); - std::string demangle(const char* name) { +std::string demangle(const char* name) { - int status = -4; // some arbitrary value to eliminate the compiler warning + int status = -4; // some arbitrary value to eliminate the compiler warning - // enable c++11 by passing the flag -std=c++11 to g++ - std::unique_ptr res { - abi::__cxa_demangle(name, NULL, NULL, &status), - std::free - }; + // enable c++11 by passing the flag -std=c++11 to g++ + std::unique_ptr res { + abi::__cxa_demangle(name, NULL, NULL, &status), + std::free + }; - return (status==0) ? res.get() : name ; - } + return (status==0) ? res.get() : name ; +} GridStopWatch Logger::GlobalStopWatch; int Logger::timestamp; @@ -113,4 +113,5 @@ void Grid_unquiesce_nodes(void) { std::cout.clear(); #endif } -} +NAMESPACE_END(Grid); + From dda151250f19c566dbed78d25c71e3241b6525d7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:59:58 +0000 Subject: [PATCH 035/754] Emacs format --- lib/parallelIO/BinaryIO.h | 91 +++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index b40a75af..fb977f2c 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,8 +24,8 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_BINARY_IO_H #define GRID_BINARY_IO_H @@ -42,15 +42,14 @@ #include #include -namespace Grid { - +NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // Byte reversal garbage ///////////////////////////////////////////////////////////////////////////////// inline uint32_t byte_reverse32(uint32_t f) { - f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; - return f; + f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; + return f; } inline uint64_t byte_reverse64(uint64_t f) { uint64_t g; @@ -80,7 +79,7 @@ inline void removeWhitespace(std::string &key) // Could just use a namespace /////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { - public: +public: ///////////////////////////////////////////////////////////////////////////// // more byte manipulation helpers @@ -106,25 +105,25 @@ class BinaryIO { uint64_t lsites = grid->lSites(); if (fbuf.size() == 1) - { - lsites = 1; - } + { + lsites = 1; + } - #pragma omp parallel +#pragma omp parallel { uint32_t nersc_csum_thr = 0; - #pragma omp for +#pragma omp for for (uint64_t local_site = 0; local_site < lsites; local_site++) - { - uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; - for (uint64_t j = 0; j < size32; j++) - { - nersc_csum_thr = nersc_csum_thr + site_buf[j]; - } - } + { + uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; + for (uint64_t j = 0; j < size32; j++) + { + nersc_csum_thr = nersc_csum_thr + site_buf[j]; + } + } - #pragma omp critical +#pragma omp critical { nersc_csum += nersc_csum_thr; } @@ -372,13 +371,13 @@ class BinaryIO { std::ifstream fin; fin.open(file, std::ios::binary | std::ios::in); if (control & BINARYIO_MASTER_APPEND) - { - fin.seekg(-sizeof(fobj), fin.end); - } + { + fin.seekg(-sizeof(fobj), fin.end); + } else - { - fin.seekg(offset + myrank * lsites * sizeof(fobj)); - } + { + fin.seekg(offset + myrank * lsites * sizeof(fobj)); + } fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj)); assert(fin.fail() == 0); fin.close(); @@ -417,17 +416,17 @@ class BinaryIO { ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); // std::cout << GridLogMessage << "Checking for errors" << std::endl; if (ierr != MPI_SUCCESS) - { - char error_string[BUFSIZ]; - int length_of_error_string, error_class; + { + char error_string[BUFSIZ]; + int length_of_error_string, error_class; - MPI_Error_class(ierr, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - fprintf(stderr, "%3d: %s\n", myrank, error_string); - MPI_Error_string(ierr, error_string, &length_of_error_string); - fprintf(stderr, "%3d: %s\n", myrank, error_string); - MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); - } + MPI_Error_class(ierr, &error_class); + MPI_Error_string(error_class, error_string, &length_of_error_string); + fprintf(stderr, "%3d: %s\n", myrank, error_string); + MPI_Error_string(ierr, error_string, &length_of_error_string); + fprintf(stderr, "%3d: %s\n", myrank, error_string); + MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); + } std::cout << GridLogDebug << "MPI 
read I/O set view " << file << std::endl; ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); @@ -557,14 +556,14 @@ class BinaryIO { // Write a Lattice of object ////////////////////////////////////////////////////////////////////////////////////// template - static inline void writeLatticeObject(Lattice &Umu, - std::string file, - munger munge, - Integer offset, - const std::string &format, - uint32_t &nersc_csum, - uint32_t &scidac_csuma, - uint32_t &scidac_csumb) + static inline void writeLatticeObject(Lattice &Umu, + std::string file, + munger munge, + Integer offset, + const std::string &format, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) { typedef typename vobj::scalar_object sobj; typedef typename vobj::Realified::scalar_type word; word w=0; @@ -713,5 +712,5 @@ class BinaryIO { std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; } }; -} +NAMESPACE_END(Grid); #endif From ab1068044eae62442161a56fec8c92b874ac0954 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:01:58 +0000 Subject: [PATCH 036/754] C++ emacs happy --- lib/parallelIO/IldgIO.h | 315 ++++++++++++++++++++-------------------- 1 file changed, 157 insertions(+), 158 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index b86e250f..85b47b08 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -23,7 +23,7 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef GRID_ILDG_IO_H #define GRID_ILDG_IO_H @@ -38,159 +38,158 @@ directory #include #include -//C-Lime is a must have for this functionality + //C-Lime is a must have for this functionality extern "C" { #include "lime.h" } -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); - ///////////////////////////////// - // Encode word types as strings - ///////////////////////////////// - template inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); } - template<> inline std::string ScidacWordMnemonic (void){ return std::string("D"); } - template<> inline std::string ScidacWordMnemonic (void){ return std::string("F"); } - template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); } - template<> inline std::string ScidacWordMnemonic(void){ return std::string("U32_t"); } - template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); } - template<> inline std::string ScidacWordMnemonic(void){ return std::string("U64_t"); } +///////////////////////////////// +// Encode word types as strings +///////////////////////////////// +template inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); } +template<> inline std::string ScidacWordMnemonic (void){ return std::string("D"); } +template<> inline std::string ScidacWordMnemonic (void){ return std::string("F"); } +template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); } +template<> inline std::string ScidacWordMnemonic(void){ return std::string("U32_t"); } +template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); } +template<> inline std::string ScidacWordMnemonic(void){ return std::string("U64_t"); } - ///////////////////////////////////////// - // Encode a generic 
tensor as a string - ///////////////////////////////////////// - template std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { +///////////////////////////////////////// +// Encode a generic tensor as a string +///////////////////////////////////////// +template std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { - typedef typename getPrecision::real_scalar_type stype; + typedef typename getPrecision::real_scalar_type stype; - int _ColourN = indexRank(); - int _ColourScalar = isScalar(); - int _ColourVector = isVector(); - int _ColourMatrix = isMatrix(); + int _ColourN = indexRank(); + int _ColourScalar = isScalar(); + int _ColourVector = isVector(); + int _ColourMatrix = isMatrix(); - int _SpinN = indexRank(); - int _SpinScalar = isScalar(); - int _SpinVector = isVector(); - int _SpinMatrix = isMatrix(); + int _SpinN = indexRank(); + int _SpinScalar = isScalar(); + int _SpinVector = isVector(); + int _SpinMatrix = isMatrix(); - int _LorentzN = indexRank(); - int _LorentzScalar = isScalar(); - int _LorentzVector = isVector(); - int _LorentzMatrix = isMatrix(); + int _LorentzN = indexRank(); + int _LorentzScalar = isScalar(); + int _LorentzVector = isVector(); + int _LorentzMatrix = isMatrix(); - std::stringstream stream; + std::stringstream stream; - stream << "GRID_"; - stream << ScidacWordMnemonic(); + stream << "GRID_"; + stream << ScidacWordMnemonic(); - if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN; - if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN; + if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN; + if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN; - if ( _SpinVector ) stream << "_SpinVector"<<_SpinN; - if ( _SpinMatrix ) stream << "_SpinMatrix"<<_SpinN; + if ( _SpinVector ) stream << "_SpinVector"<<_SpinN; + if ( _SpinMatrix ) stream << "_SpinMatrix"<<_SpinN; - if ( _ColourVector ) stream << "_ColourVector"<<_ColourN; - if ( _ColourMatrix ) stream << "_ColourMatrix"<<_ColourN; + if ( _ColourVector ) stream << "_ColourVector"<<_ColourN; + if ( _ColourMatrix ) stream << "_ColourMatrix"<<_ColourN; - if ( _ColourScalar && _LorentzScalar && _SpinScalar ) stream << "_Complex"; + if ( _ColourScalar && _LorentzScalar && _SpinScalar ) stream << "_Complex"; - typesize = sizeof(typename vobj::scalar_type); + typesize = sizeof(typename vobj::scalar_type); - if ( _ColourMatrix ) typesize*= _ColourN*_ColourN; - else typesize*= _ColourN; + if ( _ColourMatrix ) typesize*= _ColourN*_ColourN; + else typesize*= _ColourN; - if ( _SpinMatrix ) typesize*= _SpinN*_SpinN; - else typesize*= _SpinN; + if ( _SpinMatrix ) typesize*= _SpinN*_SpinN; + else typesize*= _SpinN; - colors = _ColourN; - spins = _SpinN; - datacount = _LorentzN; + colors = _ColourN; + spins = _SpinN; + datacount = _LorentzN; - return stream.str(); - } + return stream.str(); +} - template std::string ScidacRecordTypeString(Lattice & lat,int &colors, int &spins, int & typesize,int &datacount) { - return ScidacRecordTypeString(colors,spins,typesize,datacount); - }; +template std::string ScidacRecordTypeString(Lattice & lat,int &colors, int &spins, int & typesize,int &datacount) { + return ScidacRecordTypeString(colors,spins,typesize,datacount); +}; - //////////////////////////////////////////////////////////// - // Helper to fill out metadata - //////////////////////////////////////////////////////////// - template void ScidacMetaData(Lattice & field, - FieldMetaData &header, - scidacRecord & _scidacRecord, - 
scidacFile & _scidacFile) - { - typedef typename getPrecision::real_scalar_type stype; +//////////////////////////////////////////////////////////// +// Helper to fill out metadata +//////////////////////////////////////////////////////////// +template void ScidacMetaData(Lattice & field, + FieldMetaData &header, + scidacRecord & _scidacRecord, + scidacFile & _scidacFile) +{ + typedef typename getPrecision::real_scalar_type stype; - ///////////////////////////////////// - // Pull Grid's metadata - ///////////////////////////////////// - PrepareMetaData(field,header); + ///////////////////////////////////// + // Pull Grid's metadata + ///////////////////////////////////// + PrepareMetaData(field,header); - ///////////////////////////////////// - // Scidac Private File structure - ///////////////////////////////////// - _scidacFile = scidacFile(field._grid); + ///////////////////////////////////// + // Scidac Private File structure + ///////////////////////////////////// + _scidacFile = scidacFile(field._grid); - ///////////////////////////////////// - // Scidac Private Record structure - ///////////////////////////////////// - scidacRecord sr; - sr.datatype = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount); - sr.date = header.creation_date; - sr.precision = ScidacWordMnemonic(); - sr.recordtype = GRID_IO_FIELD; + ///////////////////////////////////// + // Scidac Private Record structure + ///////////////////////////////////// + scidacRecord sr; + sr.datatype = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount); + sr.date = header.creation_date; + sr.precision = ScidacWordMnemonic(); + sr.recordtype = GRID_IO_FIELD; - _scidacRecord = sr; + _scidacRecord = sr; - // std::cout << GridLogMessage << "Build SciDAC datatype " < - void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) - { - scidacFile _scidacFile(grid); - writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); - writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); - } + template + void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) + { + scidacFile _scidacFile(grid); + writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); + writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); + } //////////////////////////////////////////////// // Write generic lattice field in scidac format //////////////////////////////////////////////// @@ -424,15 +423,15 @@ class ScidacWriter : public GridLimeWriter { class ScidacReader : public GridLimeReader { - public: +public: - template - void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) - { - scidacFile _scidacFile(grid); - readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); - readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); - } + template + void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) + { + scidacFile _scidacFile(grid); + readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); + readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); + } //////////////////////////////////////////////// // Write generic lattice field in scidac format //////////////////////////////////////////////// @@ -483,7 +482,7 @@ class 
ScidacReader : public GridLimeReader { class IldgWriter : public ScidacWriter { - public: +public: /////////////////////////////////// // A little helper @@ -526,7 +525,7 @@ class IldgWriter : public ScidacWriter { header.ildg_lfn = LFN; assert ( (format == std::string("IEEE32BIG")) - ||(format == std::string("IEEE64BIG")) ); + ||(format == std::string("IEEE64BIG")) ); ////////////////////////////////////////////////////// // Fill ILDG header data struct @@ -573,7 +572,7 @@ class IldgWriter : public ScidacWriter { }; class IldgReader : public GridLimeReader { - public: +public: //////////////////////////////////////////////////////////////// // Read either Grid/SciDAC/ILDG configuration @@ -788,9 +787,9 @@ class IldgReader : public GridLimeReader { std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; } } - }; +}; -}} +NAMESPACE_END(Grid); //HAVE_LIME #endif From be5d70ae6eb923930c60fa609a6ed2840c71e3eb Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:02:10 +0000 Subject: [PATCH 037/754] C++ happy --- lib/parallelIO/IldgIOtypes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h index 5b397e14..aa909002 100644 --- a/lib/parallelIO/IldgIOtypes.h +++ b/lib/parallelIO/IldgIOtypes.h @@ -32,7 +32,7 @@ extern "C" { // for linkage #include "lime.h" } -namespace Grid { +NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // Data representation of records that enter ILDG and SciDac formats @@ -231,6 +231,6 @@ struct usqcdPropInfo : Serializable { }; #endif -} +NAMESPACE_END(Grid); #endif #endif From 6d7bdfb5f58acf42ddf3f9a33ee96f758510769e Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:02:53 +0000 Subject: [PATCH 038/754] Emacs happy --- lib/parallelIO/IldgIOtypes.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h index aa909002..065397f4 100644 --- a/lib/parallelIO/IldgIOtypes.h +++ b/lib/parallelIO/IldgIOtypes.h @@ -23,7 +23,7 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef GRID_ILDGTYPES_IO_H #define GRID_ILDGTYPES_IO_H @@ -51,12 +51,12 @@ NAMESPACE_BEGIN(Grid); // Unused SCIDAC records names; could move to support this functionality #define SCIDAC_SITELIST "scidac-sitelist" - //////////////////////////////////////////////////////////// - const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat - const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat - const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat - const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat - //////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////// +const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat +const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat +const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat +const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat +//////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////// // QIO uses mandatory "private" records fixed 
format @@ -74,7 +74,7 @@ struct emptyUserRecord : Serializable { // 1.1416 16 16 32 0 //////////////////////// struct scidacFile : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile, double, version, int, spacetime, @@ -124,7 +124,7 @@ struct scidacFile : Serializable { /////////////////////////////////////////////////////////////////////// struct scidacRecord : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord, double, version, std::string, date, @@ -159,7 +159,7 @@ public: // USQCD info //////////////////////// struct usqcdInfo : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo, double, version, double, plaq, @@ -173,7 +173,7 @@ struct usqcdInfo : Serializable { // Scidac Checksum //////////////////////// struct scidacChecksum : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum, double, version, std::string, suma, @@ -200,7 +200,7 @@ struct scidacChecksum : Serializable { // From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf //////////////////////////////////////////////////////////////////////////////////////// struct usqcdPropFile : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile, double, version, std::string, type, @@ -210,7 +210,7 @@ struct usqcdPropFile : Serializable { }; }; struct usqcdSourceInfo : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo, double, version, std::string, info); @@ -219,7 +219,7 @@ struct usqcdSourceInfo : Serializable { }; }; struct usqcdPropInfo : Serializable { - public: +public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo, double, version, int, spin, From c0a9b38c021e95dab952151f703a77f392c7497f Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:03:57 +0000 Subject: [PATCH 039/754] C++ NAMESPACE format emacs happy --- lib/parallelIO/MetaData.h | 480 +++++++++++++++++++------------------- 1 file changed, 240 insertions(+), 240 deletions(-) diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h index ccc8b18f..d9633f70 100644 --- a/lib/parallelIO/MetaData.h +++ b/lib/parallelIO/MetaData.h @@ -36,294 +36,294 @@ #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); - /////////////////////////////////////////////////////// - // Precision mapping - /////////////////////////////////////////////////////// - template static std::string getFormatString (void) - { - std::string format; - typedef typename getPrecision::real_scalar_type stype; - if ( sizeof(stype) == sizeof(float) ) { - format = std::string("IEEE32BIG"); - } - if ( sizeof(stype) == sizeof(double) ) { - format = std::string("IEEE64BIG"); - } - return format; +/////////////////////////////////////////////////////// +// Precision mapping +/////////////////////////////////////////////////////// +template static std::string getFormatString (void) +{ + std::string format; + typedef typename getPrecision::real_scalar_type stype; + if ( sizeof(stype) == sizeof(float) ) { + format = std::string("IEEE32BIG"); } - //////////////////////////////////////////////////////////////////////////////// - // header specification/interpretation - //////////////////////////////////////////////////////////////////////////////// - class FieldMetaData : Serializable { - public: + if ( sizeof(stype) == sizeof(double) ) { + format = std::string("IEEE64BIG"); + } + return format; +} +//////////////////////////////////////////////////////////////////////////////// +// header specification/interpretation 
+//////////////////////////////////////////////////////////////////////////////// +class FieldMetaData : Serializable { +public: - GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData, - int, nd, - std::vector, dimension, - std::vector, boundary, - int, data_start, - std::string, hdr_version, - std::string, storage_format, - double, link_trace, - double, plaquette, - uint32_t, checksum, - uint32_t, scidac_checksuma, - uint32_t, scidac_checksumb, - unsigned int, sequence_number, - std::string, data_type, - std::string, ensemble_id, - std::string, ensemble_label, - std::string, ildg_lfn, - std::string, creator, - std::string, creator_hardware, - std::string, creation_date, - std::string, archive_date, - std::string, floating_point); - FieldMetaData(void) { - nd=4; - dimension.resize(4); - boundary.resize(4); - scidac_checksuma=0; - scidac_checksumb=0; - checksum=0; - } - }; + GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData, + int, nd, + std::vector, dimension, + std::vector, boundary, + int, data_start, + std::string, hdr_version, + std::string, storage_format, + double, link_trace, + double, plaquette, + uint32_t, checksum, + uint32_t, scidac_checksuma, + uint32_t, scidac_checksumb, + unsigned int, sequence_number, + std::string, data_type, + std::string, ensemble_id, + std::string, ensemble_label, + std::string, ildg_lfn, + std::string, creator, + std::string, creator_hardware, + std::string, creation_date, + std::string, archive_date, + std::string, floating_point); + FieldMetaData(void) { + nd=4; + dimension.resize(4); + boundary.resize(4); + scidac_checksuma=0; + scidac_checksumb=0; + checksum=0; + } +}; - namespace QCD { +namespace QCD { - using namespace Grid; + using namespace Grid; - ////////////////////////////////////////////////////////////////////// - // Bit and Physical Checksumming and QA of data - ////////////////////////////////////////////////////////////////////// - inline void GridMetaData(GridBase *grid,FieldMetaData &header) - { - int nd = grid->_ndimension; - header.nd = nd; - header.dimension.resize(nd); - header.boundary.resize(nd); - header.data_start = 0; - for(int d=0;d_fdimensions[d]; - } - for(int d=0;d_ndimension; + header.nd = nd; + header.dimension.resize(nd); + header.boundary.resize(nd); + header.data_start = 0; + for(int d=0;d_fdimensions[d]; } - - inline void MachineCharacteristics(FieldMetaData &header) - { - // Who - struct passwd *pw = getpwuid (getuid()); - if (pw) header.creator = std::string(pw->pw_name); - - // When - std::time_t t = std::time(nullptr); - std::tm tm_ = *std::localtime(&t); - std::ostringstream oss; - // oss << std::put_time(&tm_, "%c %Z"); - header.creation_date = oss.str(); - header.archive_date = header.creation_date; - - // What - struct utsname name; uname(&name); - header.creator_hardware = std::string(name.nodename)+"-"; - header.creator_hardware+= std::string(name.machine)+"-"; - header.creator_hardware+= std::string(name.sysname)+"-"; - header.creator_hardware+= std::string(name.release); + for(int d=0;dpw_name); + + // When + std::time_t t = std::time(nullptr); + std::tm tm_ = *std::localtime(&t); + std::ostringstream oss; + // oss << std::put_time(&tm_, "%c %Z"); + header.creation_date = oss.str(); + header.archive_date = header.creation_date; + + // What + struct utsname name; uname(&name); + header.creator_hardware = std::string(name.nodename)+"-"; + header.creator_hardware+= std::string(name.machine)+"-"; + header.creator_hardware+= std::string(name.sysname)+"-"; + header.creator_hardware+= std::string(name.release); + } 
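// ------------------------------------------------------------------------
// Illustrative usage sketch only (a reading aid, not library code): the
// helpers above compose when a header is filled in before a write. `UGrid`
// is a hypothetical GridBase pointer used purely for this example.
//
//   FieldMetaData header;
//   GridMetaData(UGrid, header);     // geometry: nd, dimension, boundary
//   MachineCharacteristics(header);  // creator, dates, hardware string
//   header.floating_point = getFormatString<vLorentzColourMatrixD>();
//
// The PrepareMetaData<vobj> overloads below perform exactly this
// composition, adding gauge statistics for gauge-field types.
// ------------------------------------------------------------------------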
#define dump_meta_data(field, s) \ - s << "BEGIN_HEADER" << std::endl; \ - s << "HDR_VERSION = " << field.hdr_version << std::endl; \ - s << "DATATYPE = " << field.data_type << std::endl; \ - s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \ - for(int i=0;i<4;i++){ \ - s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \ - } \ - s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \ - s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \ - for(int i=0;i<4;i++){ \ - s << "BOUNDARY_"< inline void PrepareMetaData(Lattice & field, FieldMetaData &header) -{ - GridBase *grid = field._grid; - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - MachineCharacteristics(header); - } - inline void GaugeStatistics(Lattice & data,FieldMetaData &header) - { - // How to convert data precision etc... - header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); - header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); - } - inline void GaugeStatistics(Lattice & data,FieldMetaData &header) - { - // How to convert data precision etc... - header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); - header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); - } - template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) - { + template inline void PrepareMetaData(Lattice & field, FieldMetaData &header) + { + GridBase *grid = field._grid; + std::string format = getFormatString(); + header.floating_point = format; + header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac + GridMetaData(grid,header); + MachineCharacteristics(header); + } + inline void GaugeStatistics(Lattice & data,FieldMetaData &header) + { + // How to convert data precision etc... + header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); + header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); + } + inline void GaugeStatistics(Lattice & data,FieldMetaData &header) + { + // How to convert data precision etc... 
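+    // (Editorial note, not part of the original patch.) For reference:
+    // avgPlaquette is the site- and plane-averaged real trace of the
+    // elementary 1x1 Wilson loop, normalised by Nc,
+    //   P = < Re tr( U_mu(x) U_nu(x+mu) U_mu^dag(x+nu) U_nu^dag(x) ) / Nc >,
+    // and linkTrace is the link-averaged Re tr U_mu(x) / Nc. Together with
+    // the checksum, these are the quantities a reader verifies against the
+    // NERSC header when a configuration is loaded back in.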
+ header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); + header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); + } + template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) + { - GridBase *grid = field._grid; - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); - } - template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) - { - GridBase *grid = field._grid; - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); - } + GridBase *grid = field._grid; + std::string format = getFormatString(); + header.floating_point = format; + header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac + GridMetaData(grid,header); + GaugeStatistics(field,header); + MachineCharacteristics(header); + } + template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) + { + GridBase *grid = field._grid; + std::string format = getFormatString(); + header.floating_point = format; + header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac + GridMetaData(grid,header); + GaugeStatistics(field,header); + MachineCharacteristics(header); + } - ////////////////////////////////////////////////////////////////////// - // Utilities ; these are QCD aware - ////////////////////////////////////////////////////////////////////// - inline void reconstruct3(LorentzColourMatrix & cm) - { - const int x=0; - const int y=1; - const int z=2; - for(int mu=0;mu using iLorentzColour2x3 = iVector, 2>, Nd >; + //////////////////////////////////////////////////////////////////////////////// + // Some data types for intermediate storage + //////////////////////////////////////////////////////////////////////////////// + template using iLorentzColour2x3 = iVector, 2>, Nd >; - typedef iLorentzColour2x3 LorentzColour2x3; - typedef iLorentzColour2x3 LorentzColour2x3F; - typedef iLorentzColour2x3 LorentzColour2x3D; + typedef iLorentzColour2x3 LorentzColour2x3; + typedef iLorentzColour2x3 LorentzColour2x3F; + typedef iLorentzColour2x3 LorentzColour2x3D; -///////////////////////////////////////////////////////////////////////////////// -// Simple classes for precision conversion -///////////////////////////////////////////////////////////////////////////////// -template -struct BinarySimpleUnmunger { - typedef typename getPrecision::real_scalar_type fobj_stype; - typedef typename getPrecision::real_scalar_type sobj_stype; + ///////////////////////////////////////////////////////////////////////////////// + // Simple classes for precision conversion + ///////////////////////////////////////////////////////////////////////////////// + template + struct BinarySimpleUnmunger { + typedef typename getPrecision::real_scalar_type fobj_stype; + typedef typename getPrecision::real_scalar_type sobj_stype; - void operator()(sobj &in, fobj &out) { - // take word by word and transform accoding to the status - fobj_stype *out_buffer = (fobj_stype *)&out; - sobj_stype *in_buffer = (sobj_stype *)∈ - size_t fobj_words = sizeof(out) / sizeof(fobj_stype); - size_t sobj_words = sizeof(in) / sizeof(sobj_stype); - assert(fobj_words == sobj_words); + void operator()(sobj &in, fobj &out) { + // take word by word and transform 
according to the status
+      fobj_stype *out_buffer = (fobj_stype *)&out;
+      sobj_stype *in_buffer = (sobj_stype *)&in;
+      size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
+      size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
+      assert(fobj_words == sobj_words);

-    for (unsigned int word = 0; word < sobj_words; word++)
-      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+      for (unsigned int word = 0; word < sobj_words; word++)
+	out_buffer[word] = in_buffer[word];  // type conversion on the fly

-  }
-};
+    }
+  };

-template <class fobj, class sobj>
-struct BinarySimpleMunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+  template <class fobj, class sobj>
+  struct BinarySimpleMunger {
+    typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+    typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;

-  void operator()(fobj &in, sobj &out) {
-    // take word by word and transform accoding to the status
-    fobj_stype *in_buffer = (fobj_stype *)&in;
-    sobj_stype *out_buffer = (sobj_stype *)&out;
-    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
-    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
-    assert(fobj_words == sobj_words);
+    void operator()(fobj &in, sobj &out) {
+      // take word by word and transform according to the status
+      fobj_stype *in_buffer = (fobj_stype *)&in;
+      sobj_stype *out_buffer = (sobj_stype *)&out;
+      size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
+      size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
+      assert(fobj_words == sobj_words);

-    for (unsigned int word = 0; word < sobj_words; word++)
-      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+      for (unsigned int word = 0; word < sobj_words; word++)
+	out_buffer[word] = in_buffer[word];  // type conversion on the fly

-  }
-};
+    }
+  };

-  template <class fobj, class sobj>
-  struct GaugeSimpleMunger{
-    void operator()(fobj &in, sobj &out) {
-      for (int mu = 0; mu < Nd; mu++) {
-        for (int i = 0; i < Nc; i++) {
+  template <class fobj, class sobj>
+  struct GaugeSimpleMunger{
+    void operator()(fobj &in, sobj &out) {
+      for (int mu = 0; mu < Nd; mu++) {
+	for (int i = 0; i < Nc; i++) {
	  for (int j = 0; j < Nc; j++) {
	    out(mu)()(i, j) = in(mu)()(i, j);
	  }}
-      }
-    };
+      } };
+  };

-  template <class fobj, class sobj>
-  struct GaugeSimpleUnmunger {
+  template <class fobj, class sobj>
+  struct GaugeSimpleUnmunger {

-    void operator()(sobj &in, fobj &out) {
-      for (int mu = 0; mu < Nd; mu++) {
-        for (int i = 0; i < Nc; i++) {
+    void operator()(sobj &in, fobj &out) {
+      for (int mu = 0; mu < Nd; mu++) {
+	for (int i = 0; i < Nc; i++) {
	  for (int j = 0; j < Nc; j++) {
	    out(mu)()(i, j) = in(mu)()(i, j);
	  }}
-      }
-    };
+      } };
+  };

-  template <class fobj, class sobj>
-  struct Gauge3x2munger{
-    void operator() (fobj &in,sobj &out){
-      for(int mu=0;mu<Nd;mu++){
-	for(int i=0;i<2;i++){
-	  for(int j=0;j<3;j++){
-	    out(mu)()(i,j) = in(mu)(i)(j);
-	  }}
-      }
-      reconstruct3(out);
-    }
-  };
+  template <class fobj, class sobj>
+  struct Gauge3x2munger{
+    void operator() (fobj &in,sobj &out){
+      for(int mu=0;mu<Nd;mu++){
+	for(int i=0;i<2;i++){
+	  for(int j=0;j<3;j++){
+	    out(mu)()(i,j) = in(mu)(i)(j);
+	  }}
+      }
+      reconstruct3(out);
+    }
+  };

-  template <class fobj, class sobj>
-  struct Gauge3x2unmunger{
-    void operator() (sobj &in,fobj &out){
-      for(int mu=0;mu<Nd;mu++){
-	for(int i=0;i<2;i++){
-	  for(int j=0;j<3;j++){
-	    out(mu)(i)(j) = in(mu)()(i,j);
-	  }}
-      }
-    }
-  };
+  template <class fobj, class sobj>
+  struct Gauge3x2unmunger{
+    void operator() (sobj &in,fobj &out){
+      for(int mu=0;mu<Nd;mu++){
+	for(int i=0;i<2;i++){
+	  for(int j=0;j<3;j++){
+	    out(mu)(i)(j) = in(mu)()(i,j);
+	  }}
+      }
+    }
+  };
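The BinarySimple munger/unmunger pair above is the whole precision-conversion
story: both objects are viewed as flat arrays of their real scalar type and
copied word by word, with the narrowing or widening happening in the
assignment. A self-contained illustration of the same idiom follows
(hypothetical names, not Grid API):

    #include <cassert>
    #include <cstddef>

    // FWord/SWord are the file-side and memory-side scalar widths, e.g.
    // float and double; Fobj/Sobj are any PODs laid out as arrays of them.
    template <class FWord, class SWord, class Fobj, class Sobj>
    void simple_munge(const Fobj &in, Sobj &out) {
      const FWord *src = (const FWord *)&in;
      SWord *dst = (SWord *)&out;
      size_t nf = sizeof(in) / sizeof(FWord);
      size_t ns = sizeof(out) / sizeof(SWord);
      assert(nf == ns);            // same element count, widths may differ
      for (size_t w = 0; w < ns; w++)
        dst[w] = (SWord)src[w];    // conversion happens here, on the fly
    }

A call such as simple_munge<float,double>(file_obj, mem_obj) widens a
single-precision file record into a double-precision working object. The
Gauge3x2 pair adds one physics step on top: only two SU(3) rows are stored on
disk, and reconstruct3 rebuilds the third row as the conjugate cross product
of the first two before the configuration is used.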
From: paboyle
Date: Sat, 13 Jan 2018 00:05:33 +0000
Subject: [PATCH 040/754] NAMESPACE and reformat
---
 lib/parallelIO/NerscIO.h | 491 +++++++++++++++++++--------------------
 1 file changed, 245 insertions(+), 246 deletions(-)

diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index 786839f2..424cd232 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -30,324 +30,323 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H

-namespace Grid {
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);

-    using namespace Grid;
+using namespace Grid;

-    ////////////////////////////////////////////////////////////////////////////////
-    // Write and read from fstream; comput header offset for payload
-    ////////////////////////////////////////////////////////////////////////////////
-    class NerscIO : public BinaryIO {
-    public:
+////////////////////////////////////////////////////////////////////////////////
+// Write and read from fstream; compute header offset for payload
+////////////////////////////////////////////////////////////////////////////////
+class NerscIO : public BinaryIO {
+public:

-      static inline void truncate(std::string file){
-	std::ofstream fout(file,std::ios::out);
-      }
+  static inline void truncate(std::string file){
+    std::ofstream fout(file,std::ios::out);
+  }

-      static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
-      {
-	std::ofstream fout(file,std::ios::out|std::ios::in);
-	fout.seekp(0,std::ios::beg);
-	dump_meta_data(field, fout);
-	field.data_start = fout.tellp();
-	return field.data_start;
-      }
+  static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
+  {
+    std::ofstream fout(file,std::ios::out|std::ios::in);
+    fout.seekp(0,std::ios::beg);
+    dump_meta_data(field, fout);
+    field.data_start = fout.tellp();
+    return field.data_start;
+  }

-      // for the header-reader
-      static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
-      {
-	int offset=0;
-	std::map<std::string,std::string> header;
-	std::string line;
+  // for the header-reader
+  static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
+  {
+    int offset=0;
+    std::map<std::string,std::string> header;
+    std::string line;

-	//////////////////////////////////////////////////
-	// read the header
-	//////////////////////////////////////////////////
-	std::ifstream fin(file);
+    //////////////////////////////////////////////////
+    // read the header
+    //////////////////////////////////////////////////
+    std::ifstream fin(file);

-	getline(fin,line); // read one line and insist is
+    getline(fin,line); // read one line and insist it is BEGIN_HEADER

-	removeWhitespace(line);
-	std::cout << GridLogMessage << "* " << line << std::endl;
+    removeWhitespace(line);
+    std::cout << GridLogMessage << "* " << line << std::endl;

-	assert(line==std::string("BEGIN_HEADER"));
+    assert(line==std::string("BEGIN_HEADER"));

-	do {
+    do {
       getline(fin,line); // read one line
       std::cout << GridLogMessage << "* "<<line<<std::endl;
       int eq = line.find("=");
       if(eq >0) {
-	    std::string key=line.substr(0,eq);
-	    std::string val=line.substr(eq+1);
-	    removeWhitespace(key);
-	    removeWhitespace(val);
+	std::string key=line.substr(0,eq);
+	std::string val=line.substr(eq+1);
+	removeWhitespace(key);
+	removeWhitespace(val);

-	    header[key] = val;
-	  }
+	header[key] = val;
+      }
     } while( line.find("END_HEADER") == std::string::npos );

-	field.data_start = fin.tellg();
+    field.data_start = fin.tellg();

-	//////////////////////////////////////////////////
-	// chomp the values
-	//////////////////////////////////////////////////
-	field.hdr_version    = header["HDR_VERSION"];
-	field.data_type      = header["DATATYPE"];
-	field.storage_format = header["STORAGE_FORMAT"];
+    //////////////////////////////////////////////////
+    // chomp the values
+    //////////////////////////////////////////////////
+    field.hdr_version    = header["HDR_VERSION"];
+    field.data_type      = header["DATATYPE"];
+    field.storage_format = header["STORAGE_FORMAT"];

-	field.dimension[0] = std::stol(header["DIMENSION_1"]);
-	field.dimension[1] = std::stol(header["DIMENSION_2"]);
-	field.dimension[2] = std::stol(header["DIMENSION_3"]);
-	field.dimension[3] = std::stol(header["DIMENSION_4"]);
+    field.dimension[0] = std::stol(header["DIMENSION_1"]);
+    
field.dimension[1] = std::stol(header["DIMENSION_2"]); + field.dimension[2] = std::stol(header["DIMENSION_3"]); + field.dimension[3] = std::stol(header["DIMENSION_4"]); - assert(grid->_ndimension == 4); - for(int d=0;d<4;d++){ + assert(grid->_ndimension == 4); + for(int d=0;d<4;d++){ assert(grid->_fdimensions[d]==field.dimension[d]); } - field.link_trace = std::stod(header["LINK_TRACE"]); - field.plaquette = std::stod(header["PLAQUETTE"]); + field.link_trace = std::stod(header["LINK_TRACE"]); + field.plaquette = std::stod(header["PLAQUETTE"]); - field.boundary[0] = header["BOUNDARY_1"]; - field.boundary[1] = header["BOUNDARY_2"]; - field.boundary[2] = header["BOUNDARY_3"]; - field.boundary[3] = header["BOUNDARY_4"]; + field.boundary[0] = header["BOUNDARY_1"]; + field.boundary[1] = header["BOUNDARY_2"]; + field.boundary[2] = header["BOUNDARY_3"]; + field.boundary[3] = header["BOUNDARY_4"]; - field.checksum = std::stoul(header["CHECKSUM"],0,16); - field.ensemble_id = header["ENSEMBLE_ID"]; - field.ensemble_label = header["ENSEMBLE_LABEL"]; - field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]); - field.creator = header["CREATOR"]; - field.creator_hardware = header["CREATOR_HARDWARE"]; - field.creation_date = header["CREATION_DATE"]; - field.archive_date = header["ARCHIVE_DATE"]; - field.floating_point = header["FLOATING_POINT"]; + field.checksum = std::stoul(header["CHECKSUM"],0,16); + field.ensemble_id = header["ENSEMBLE_ID"]; + field.ensemble_label = header["ENSEMBLE_LABEL"]; + field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]); + field.creator = header["CREATOR"]; + field.creator_hardware = header["CREATOR_HARDWARE"]; + field.creation_date = header["CREATION_DATE"]; + field.archive_date = header["ARCHIVE_DATE"]; + field.floating_point = header["FLOATING_POINT"]; - return field.data_start; + return field.data_start; + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Now the meat: the object readers + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + template + static inline void readConfiguration(Lattice > &Umu, + FieldMetaData& header, + std::string file) + { + typedef Lattice > GaugeField; + + GridBase *grid = Umu._grid; + int offset = readHeader(file,Umu._grid,header); + + FieldMetaData clone(header); + + std::string format(header.floating_point); + + int ieee32big = (format == std::string("IEEE32BIG")); + int ieee32 = (format == std::string("IEEE32")); + int ieee64big = (format == std::string("IEEE64BIG")); + int ieee64 = (format == std::string("IEEE64")); + + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + // depending on datatype, set up munger; + // munger is a function of + if ( header.data_type == std::string("4D_SU3_GAUGE") ) { + if ( ieee32 || ieee32big ) { + BinaryIO::readLatticeObject, LorentzColour2x3F> + (Umu,file,Gauge3x2munger(), offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + } + if ( ieee64 || ieee64big ) { + BinaryIO::readLatticeObject, LorentzColour2x3D> + (Umu,file,Gauge3x2munger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + } + } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { + if ( ieee32 || ieee32big ) { + BinaryIO::readLatticeObject,LorentzColourMatrixF> + (Umu,file,GaugeSimpleMunger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + } + if ( ieee64 || ieee64big ) { + BinaryIO::readLatticeObject,LorentzColourMatrixD> + 
(Umu,file,GaugeSimpleMunger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + } + } else { + assert(0); } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Now the meat: the object readers - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + GaugeStatistics(Umu,clone); - template - static inline void readConfiguration(Lattice > &Umu, - FieldMetaData& header, - std::string file) - { - typedef Lattice > GaugeField; + std::cout< - if ( header.data_type == std::string("4D_SU3_GAUGE") ) { - if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject, LorentzColour2x3F> - (Umu,file,Gauge3x2munger(), offset,format, - nersc_csum,scidac_csuma,scidac_csumb); - } - if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject, LorentzColour2x3D> - (Umu,file,Gauge3x2munger(),offset,format, - nersc_csum,scidac_csuma,scidac_csumb); - } - } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { - if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixF> - (Umu,file,GaugeSimpleMunger(),offset,format, - nersc_csum,scidac_csuma,scidac_csumb); - } - if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixD> - (Umu,file,GaugeSimpleMunger(),offset,format, - nersc_csum,scidac_csuma,scidac_csumb); - } - } else { - assert(0); - } - - GaugeStatistics(Umu,clone); - - std::cout<= 1.0e-5 ) { - std::cout << " Plaquette mismatch "<= 1.0e-5 ) { + std::cout << " Plaquette mismatch "< - static inline void writeConfiguration(Lattice > &Umu, - std::string file, - int two_row, - int bits32) - { - typedef Lattice > GaugeField; + template + static inline void writeConfiguration(Lattice > &Umu, + std::string file, + int two_row, + int bits32) + { + typedef Lattice > GaugeField; - typedef iLorentzColourMatrix vobj; - typedef typename vobj::scalar_object sobj; + typedef iLorentzColourMatrix vobj; + typedef typename vobj::scalar_object sobj; - FieldMetaData header; - /////////////////////////////////////////// - // Following should become arguments - /////////////////////////////////////////// - header.sequence_number = 1; - header.ensemble_id = "UKQCD"; - header.ensemble_label = "DWF"; + FieldMetaData header; + /////////////////////////////////////////// + // Following should become arguments + /////////////////////////////////////////// + header.sequence_number = 1; + header.ensemble_id = "UKQCD"; + header.ensemble_label = "DWF"; - typedef LorentzColourMatrixD fobj3D; - typedef LorentzColour2x3D fobj2D; + typedef LorentzColourMatrixD fobj3D; + typedef LorentzColour2x3D fobj2D; - GridBase *grid = Umu._grid; + GridBase *grid = Umu._grid; - GridMetaData(grid,header); - assert(header.nd==4); - GaugeStatistics(Umu,header); - MachineCharacteristics(header); + GridMetaData(grid,header); + assert(header.nd==4); + GaugeStatistics(Umu,header); + MachineCharacteristics(header); - int offset; + int offset; - truncate(file); + truncate(file); - // Sod it -- always write 3x3 double - header.floating_point = std::string("IEEE64BIG"); - header.data_type = std::string("4D_SU3_GAUGE_3x3"); - GaugeSimpleUnmunger munge; - offset = writeHeader(header,file); + // Sod it -- always write 3x3 double + header.floating_point = std::string("IEEE64BIG"); + header.data_type = std::string("4D_SU3_GAUGE_3x3"); + GaugeSimpleUnmunger munge; + offset = writeHeader(header,file); - uint32_t nersc_csum,scidac_csuma,scidac_csumb; - 
BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point, - nersc_csum,scidac_csuma,scidac_csumb); - header.checksum = nersc_csum; - writeHeader(header,file); + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point, + nersc_csum,scidac_csuma,scidac_csumb); + header.checksum = nersc_csum; + writeHeader(header,file); - std::cout< - uint32_t nersc_csum,scidac_csuma,scidac_csumb; - BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); + // depending on datatype, set up munger; + // munger is a function of + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); - if ( nersc_csum != header.checksum ) { - std::cerr << "checksum mismatch "< Date: Sat, 13 Jan 2018 00:08:25 +0000 Subject: [PATCH 041/754] Clean up and format NAMESPACE --- lib/cshift/Cshift.h | 6 ++--- lib/cshift/Cshift_common.h | 50 ++++++++++++++++++++------------------ lib/cshift/Cshift_mpi.h | 12 +++++---- lib/cshift/Cshift_none.h | 11 +++++---- 4 files changed, 42 insertions(+), 37 deletions(-) diff --git a/lib/cshift/Cshift.h b/lib/cshift/Cshift.h index 7d0caeee..07ec8412 100644 --- a/lib/cshift/Cshift.h +++ b/lib/cshift/Cshift.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef _GRID_CSHIFT_H_ #define _GRID_CSHIFT_H_ diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 1be672e8..d9cc1eb5 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -23,12 +23,12 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef _GRID_CSHIFT_COMMON_H_ #define _GRID_CSHIFT_COMMON_H_ -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split @@ -56,20 +56,20 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } } else { - int bo=0; - std::vector > table; - for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb &cbmask ) { - table.push_back(std::pair (bo++,o+b)); - } - } - } - parallel_for(int i=0;i > table; + for(int n=0;nCheckerBoardFromOindex(o+b); + if ( ocb &cbmask ) { + table.push_back(std::pair (bo++,o+b)); + } + } + } + parallel_for(int i=0;i void Scatter_plane_simple (Lattice &rhs,commVector void Copy_plane(Lattice& lhs,const Lattice &rhs int o =n*stride+b; int ocb=1<CheckerBoardFromOindex(o); if ( ocb&cbmask ) { - //lhs._odata[lo+o]=rhs._odata[ro+o]; + //lhs._odata[lo+o]=rhs._odata[ro+o]; vstream(lhs._odata[lo+o],rhs._odata[ro+o]); } } @@ -270,7 +270,7 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_stride[dimension]; parallel_for_nest2(int n=0;nCheckerBoardFromOindex(o+b); @@ -278,7 +278,7 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift_local(Lattice &ret,const Lattice } return ret; } -} + +NAMESPACE_END(Grid); + #endif diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h index a66b49bf..faf932d6 100644 --- a/lib/cshift/Cshift_mpi.h +++ b/lib/cshift/Cshift_mpi.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,13 +24,13 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_ -namespace Grid { +NAMESPACE_BEGIN(Grid); template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { @@ -250,6 +250,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef _GRID_CSHIFT_NONE_H_ #define _GRID_CSHIFT_NONE_H_ -namespace Grid { +NAMESPACE_BEGIN(Grid); template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { Lattice ret(rhs._grid); @@ -35,5 +35,6 @@ template Lattice Cshift(const Lattice &rhs,int dimension Cshift_local(ret,rhs,dimension,shift); return ret; } -} +NAMESPACE_END(Grid); + #endif From 7f6bffe5ad0d3f2659e95038b8b67600f7ce8ffa Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:11:30 +0000 Subject: [PATCH 042/754] NAMESPACE --- lib/stencil/Lebesgue.cc | 5 +++-- lib/stencil/Lebesgue.h | 6 ++++-- lib/stencil/SimpleCompressor.h | 5 +++-- lib/stencil/Stencil.cc | 5 +++-- lib/stencil/Stencil.h | 5 +++-- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 2880e4b6..bd8ed3ad 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -29,7 +29,7 @@ Author: paboyle #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); int LebesgueOrder::UseLebesgueOrder; #ifdef KNL @@ -239,4 +239,5 @@ void LebesgueOrder::ZGraph(void) } */ } -} +NAMESPACE_END(Grid); + diff --git a/lib/stencil/Lebesgue.h b/lib/stencil/Lebesgue.h index 7db0cc6b..32287487 100644 --- a/lib/stencil/Lebesgue.h +++ b/lib/stencil/Lebesgue.h @@ -32,7 +32,7 @@ Author: paboyle #include // Lebesgue, Morton, Z-graph ordering assistance -namespace Grid { +NAMESPACE_BEGIN(Grid); class LebesgueOrder { public: @@ -76,5 +76,7 @@ namespace Grid { std::vector _LebesgueReorder; }; -} + +NAMESPACE_END(Grid); + #endif diff --git a/lib/stencil/SimpleCompressor.h b/lib/stencil/SimpleCompressor.h index 58c3bcd2..f4f096de 100644 --- a/lib/stencil/SimpleCompressor.h +++ b/lib/stencil/SimpleCompressor.h @@ -1,7 +1,7 @@ #ifndef _STENCIL_SIMPLE_COMPRESSOR_H_ #define _STENCIL_SIMPLE_COMPRESSOR_H_ -namespace Grid { +NAMESPACE_BEGIN(Grid); template class SimpleCompressor { @@ -25,5 +25,6 @@ public: } }; -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/stencil/Stencil.cc b/lib/stencil/Stencil.cc index e04a5360..9cd6a9ab 100644 --- a/lib/stencil/Stencil.cc +++ b/lib/stencil/Stencil.cc @@ -27,7 +27,7 @@ /* END LEGAL */ #include -namespace Grid { +NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) @@ -66,4 +66,5 @@ void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbma } } -} +NAMESPACE_END(Grid); + diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 69c010f4..ef35e100 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -48,7 +48,7 @@ // ////////////////////////////////////////////////////////////////////////////////////////// -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////// // Gather for when there *is* need to SIMD split with compression @@ -1164,5 +1164,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal }; }; -} +NAMESPACE_END(Grid); + #endif From 5e48b701ecaa6e9cf184841f0c79aeaf4def1237 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:11:53 +0000 Subject: [PATCH 043/754] FOrmatting --- lib/stencil/Lebesgue.cc | 48 ++++++++++++------------- 
lib/stencil/Lebesgue.h | 72 ++++++++++++++++++------------------- lib/stencil/Stencil.cc | 30 ++++++++-------- lib/stencil/Stencil.h | 78 ++++++++++++++++++++--------------------- 4 files changed, 114 insertions(+), 114 deletions(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index bd8ed3ad..c6d7369e 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,8 +24,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include @@ -72,9 +72,9 @@ void LebesgueOrder::ThreadInterleave(void) for(int t=0;t> blockbits) % threads == t ) { - throrder.push_back(reorder[ss]); - } + if ( ( ss >> blockbits) % threads == t ) { + throrder.push_back(reorder[ss]); + } } } _LebesgueReorder = throrder; @@ -112,9 +112,9 @@ void LebesgueOrder::CartesianBlocking(void) }; void LebesgueOrder::IterateO(int ND,int dim, - std::vector & xo, - std::vector & xi, - std::vector &dims) + std::vector & xo, + std::vector & xi, + std::vector &dims) { for(xo[dim]=0;xo[dim] 0 ) { @@ -126,10 +126,10 @@ void LebesgueOrder::IterateO(int ND,int dim, }; void LebesgueOrder::IterateI(int ND, - int dim, - std::vector & xo, - std::vector & xi, - std::vector &dims) + int dim, + std::vector & xo, + std::vector & xi, + std::vector &dims) { std::vector x(ND); for(xi[dim]=0;xi[dim]_rdimensions); _LebesgueReorder.push_back(index); @@ -227,16 +227,16 @@ void LebesgueOrder::ZGraph(void) assert( _LebesgueReorder.size() == vol ); /* - std::vector coor(4); - for(IndexInteger asite=0;asite coor(4); + for(IndexInteger asite=0;asiteoCoorFromOindex (coor,_LebesgueReorder[asite]); - std::cout << " site "<" << _LebesgueReorder[asite]<< " = [" - << coor[0]<<"," - << coor[1]<<"," - << coor[2]<<"," - << coor[3]<<"]" - <" << _LebesgueReorder[asite]<< " = [" + << coor[0]<<"," + << coor[1]<<"," + << coor[2]<<"," + << coor[3]<<"]" + < 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_LEBESGUE_H #define GRID_LEBESGUE_H @@ -34,48 +34,48 @@ Author: paboyle // Lebesgue, Morton, Z-graph ordering assistance NAMESPACE_BEGIN(Grid); - class LebesgueOrder { - public: +class LebesgueOrder { +public: - typedef int32_t IndexInteger; + typedef int32_t IndexInteger; - static int UseLebesgueOrder; - GridBase *grid; + static int UseLebesgueOrder; + GridBase *grid; - public: - LebesgueOrder(GridBase *_grid); +public: + LebesgueOrder(GridBase *_grid); - inline IndexInteger Reorder(IndexInteger ss) { - return _LebesgueReorder[ss] ; - }; + inline IndexInteger Reorder(IndexInteger ss) { + return _LebesgueReorder[ss] ; + }; - //////////////////////////// - // Space filling fractal for cache oblivious - //////////////////////////// - void ZGraph(void); - IndexInteger alignup(IndexInteger n); + //////////////////////////// + // Space filling fractal for cache oblivious + //////////////////////////// + void ZGraph(void); + IndexInteger alignup(IndexInteger n); - ///////////////////////////////// - // Cartesian stencil blocking strategy - ///////////////////////////////// - static std::vector Block; - void NoBlocking(void); - void CartesianBlocking(void); - void IterateO(int ND,int dim, - std::vector & xo, - std::vector & xi, - std::vector &dims); - void IterateI(int ND,int dim, - std::vector & xo, - std::vector & xi, - std::vector &dims); + ///////////////////////////////// + // Cartesian stencil blocking strategy + ///////////////////////////////// + static std::vector Block; + void NoBlocking(void); + void CartesianBlocking(void); + void IterateO(int ND,int dim, + std::vector & xo, + std::vector & xi, + std::vector &dims); + void IterateI(int ND,int dim, + std::vector & xo, + std::vector & xi, + std::vector &dims); - void ThreadInterleave(void); + void ThreadInterleave(void); - private: - std::vector _LebesgueReorder; +private: + std::vector _LebesgueReorder; - }; +}; NAMESPACE_END(Grid); diff --git a/lib/stencil/Stencil.cc b/lib/stencil/Stencil.cc index 9cd6a9ab..c1b33baa 100644 --- a/lib/stencil/Stencil.cc +++ b/lib/stencil/Stencil.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,14 +23,14 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, - int off,std::vector > & table) + int off,std::vector > & table) { table.resize(0); @@ -52,17 +52,17 @@ void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbma } } } else { - int bo=0; - table.resize(e1*e2/2); - for(int n=0;nCheckerBoardFromOindexTable(o+b); - if ( ocb &cbmask ) { - table[bo]=std::pair(bo,o+b); bo++; - } - } - } + int bo=0; + table.resize(e1*e2/2); + for(int n=0;nCheckerBoardFromOindexTable(o+b); + if ( ocb &cbmask ) { + table[bo]=std::pair(bo,o+b); bo++; + } + } + } } } diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index ef35e100..aeadd4b9 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -23,30 +23,30 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_STENCIL_H #define GRID_STENCIL_H #include // subdir aggregate #include // subdir aggregate - ////////////////////////////////////////////////////////////////////////////////////////// - // Must not lose sight that goal is to be able to construct really efficient - // gather to a point stencil code. CSHIFT is not the best way, so need - // additional stencil support. - // - // Stencil based code will exchange haloes and use a table lookup for neighbours. - // This will be done with generality to allow easier efficient implementations. - // Overlap of comms and compute is enabled by tabulating off-node connected, - // - // Generic services - // 0) Prebuild neighbour tables - // 1) Compute sizes of all haloes/comms buffers; allocate them. - // 2) Gather all faces, and communicate. - // 3) Loop over result sites, giving nbr index/offnode info for each - // - ////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////// +// Must not lose sight that goal is to be able to construct really efficient +// gather to a point stencil code. CSHIFT is not the best way, so need +// additional stencil support. +// +// Stencil based code will exchange haloes and use a table lookup for neighbours. +// This will be done with generality to allow easier efficient implementations. +// Overlap of comms and compute is enabled by tabulating off-node connected, +// +// Generic services +// 0) Prebuild neighbour tables +// 1) Compute sizes of all haloes/comms buffers; allocate them. +// 2) Gather all faces, and communicate. 
+// 3) Loop over result sites, giving nbr index/offnode info for each +// +////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); @@ -54,7 +54,7 @@ NAMESPACE_BEGIN(Grid); // Gather for when there *is* need to SIMD split with compression /////////////////////////////////////////////////////////////////// void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, - int off,std::vector > & table); + int off,std::vector > & table); template void Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); @@ -89,21 +89,21 @@ void Gather_plane_exchange_table(std::vector >& table,const L } } - struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - uint16_t _pad; - }; +struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline + uint16_t _pad; +}; //////////////////////////////////////// // The Stencil Class itself //////////////////////////////////////// template class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. - public: +public: typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -538,17 +538,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; - CartesianStencil(GridBase *grid, - int npoints, - int checkerboard, - const std::vector &directions, - const std::vector &distances) - : _permute_type(npoints), - _comm_buf_size(npoints), - comm_bytes_thr(npoints), - comm_enter_thr(npoints), - comm_leave_thr(npoints), - comm_time_thr(npoints) + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances) + : _permute_type(npoints), + _comm_buf_size(npoints), + comm_bytes_thr(npoints), + comm_enter_thr(npoints), + comm_leave_thr(npoints), + comm_time_thr(npoints) { face_table_computed=0; _npoints = npoints; From b45bd8e097fbded25379cd289adf9e937be6be16 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:16:34 +0000 Subject: [PATCH 044/754] NAMESPACE --- lib/communicator/Communicator_base.cc | 5 +++-- lib/communicator/Communicator_base.h | 5 +++-- lib/communicator/Communicator_mpi3.cc | 5 ++--- lib/communicator/Communicator_none.cc | 4 ++-- lib/communicator/SharedMemory.cc | 5 ++--- lib/communicator/SharedMemory.h | 17 +++-------------- lib/communicator/SharedMemoryMPI.cc | 5 +++-- lib/communicator/SharedMemoryNone.cc | 5 +++-- 8 files changed, 21 insertions(+), 30 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index edbf26af..c5650d35 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -31,7 +31,7 @@ Author: Peter Boyle #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////// // Info that is setup once and indept of cartesian layout @@ -72,5 +72,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) GlobalSumVector((double *)c,2*N); } -} +NAMESPACE_END(Grid); + diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 359846c9..016bb7b4 100644 --- 
a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -34,7 +34,7 @@ Author: Peter Boyle /////////////////////////////////// #include -namespace Grid { +NAMESPACE_BEGIN(Grid); class CartesianCommunicator : public SharedMemory { @@ -201,6 +201,7 @@ public: }; }; -} + +NAMESPACE_END(Grid); #endif diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index ef47d617..8022b775 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -28,7 +28,7 @@ Author: Peter Boyle #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); Grid_MPI_Comm CartesianCommunicator::communicator_world; @@ -488,7 +488,6 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t MPI_Type_free(&object); } +NAMESPACE_END(Grid); -} - diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index c3763d53..9606e5b9 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -27,7 +27,7 @@ Author: Peter Boyle /* END LEGAL */ #include -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////// // Info that is setup once and indept of cartesian layout @@ -160,6 +160,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector #include -namespace Grid { +NAMESPACE_BEGIN(Grid); // static data @@ -87,6 +87,5 @@ void *SharedMemory::ShmBufferSelf(void) return ShmCommBufs[ShmRank]; } +NAMESPACE_END(Grid); - -} diff --git a/lib/communicator/SharedMemory.h b/lib/communicator/SharedMemory.h index 0f647dc6..4987b057 100644 --- a/lib/communicator/SharedMemory.h +++ b/lib/communicator/SharedMemory.h @@ -25,18 +25,6 @@ Author: Peter Boyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - - -// TODO -// 1) move includes into SharedMemory.cc -// -// 2) split shared memory into a) optimal communicator creation from comm world -// -// b) shared memory buffers container -// -- static globally shared; init once -// -- per instance set of buffers. 
-// - #pragma once #include @@ -57,7 +45,7 @@ Author: Peter Boyle #include #endif -namespace Grid { +NAMESPACE_BEGIN(Grid); #if defined (GRID_COMMS_MPI3) typedef MPI_Comm Grid_MPI_Comm; @@ -161,4 +149,5 @@ class SharedMemory }; -} +NAMESPACE_END(Grid); + diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc index d7bd7c65..b10e6615 100644 --- a/lib/communicator/SharedMemoryMPI.cc +++ b/lib/communicator/SharedMemoryMPI.cc @@ -28,7 +28,7 @@ Author: Peter Boyle #include -namespace Grid { +NAMESPACE_BEGIN(Grid); /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -392,4 +392,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) } } -} +NAMESPACE_END(Grid); + diff --git a/lib/communicator/SharedMemoryNone.cc b/lib/communicator/SharedMemoryNone.cc index 7feed7e4..efd9e585 100644 --- a/lib/communicator/SharedMemoryNone.cc +++ b/lib/communicator/SharedMemoryNone.cc @@ -28,7 +28,7 @@ Author: Peter Boyle #include -namespace Grid { +NAMESPACE_BEGIN(Grid); /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -123,4 +123,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) return NULL; } -} +NAMESPACE_END(Grid); + From 8cb7a1a88741b7a5b558ec0f182fec86fda2cd11 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:17:16 +0000 Subject: [PATCH 045/754] Format --- lib/communicator/Communicator.h | 6 +++--- lib/communicator/Communicator_base.cc | 6 +++--- lib/communicator/Communicator_base.h | 16 ++++++++-------- lib/communicator/Communicator_mpi3.cc | 8 ++++---- lib/communicator/Communicator_none.cc | 10 +++++----- lib/communicator/SharedMemory.h | 18 +++++++++--------- lib/communicator/SharedMemoryMPI.cc | 8 ++++---- lib/communicator/SharedMemoryNone.cc | 8 ++++---- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/lib/communicator/Communicator.h b/lib/communicator/Communicator.h index d4ec5a13..c332c04a 100644 --- a/lib/communicator/Communicator.h +++ b/lib/communicator/Communicator.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index c5650d35..b10c1a04 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include #include diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 016bb7b4..cd64abb7 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -1,5 +1,5 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,8 +24,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_COMMUNICATOR_BASE_H #define GRID_COMMUNICATOR_BASE_H @@ -73,7 +73,7 @@ public: CartesianCommunicator(const std::vector &pdimensions_in); virtual ~CartesianCommunicator(); - private: +private: //////////////////////////////////////////////// // Private initialise from an MPI communicator @@ -81,7 +81,7 @@ public: //////////////////////////////////////////////// void InitFromMPICommunicator(const std::vector &processors, Grid_MPI_Comm communicator_base); - public: +public: //////////////////////////////////////////////////////////////////////////////////////// // Wraps MPI_Cart routines, or implements equivalent on other impls @@ -196,9 +196,9 @@ public: void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes); template void Broadcast(int root,obj &data) - { - Broadcast(root,(void *)&data,sizeof(data)); - }; + { + Broadcast(root,(void *)&data,sizeof(data)); + }; }; diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 8022b775..dbe570f8 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -23,8 +23,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include @@ -53,8 +53,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::SharedMemoryAllocate( - GlobalSharedMemory::MAX_MPI_SHM_BYTES, - GlobalSharedMemory::Hugepages); + GlobalSharedMemory::MAX_MPI_SHM_BYTES, + GlobalSharedMemory::Hugepages); } /////////////////////////////////////////////////////////////////////////// diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 9606e5b9..12a84995 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include NAMESPACE_BEGIN(Grid); @@ -38,8 +38,8 @@ void CartesianCommunicator::Init(int *argc, char *** arv) { GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::SharedMemoryAllocate( - GlobalSharedMemory::MAX_MPI_SHM_BYTES, - GlobalSharedMemory::Hugepages); + GlobalSharedMemory::MAX_MPI_SHM_BYTES, + GlobalSharedMemory::Hugepages); } CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) diff --git a/lib/communicator/SharedMemory.h b/lib/communicator/SharedMemory.h index 4987b057..1e213b86 100644 --- a/lib/communicator/SharedMemory.h +++ b/lib/communicator/SharedMemory.h @@ -48,15 +48,15 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); #if defined (GRID_COMMS_MPI3) - typedef MPI_Comm Grid_MPI_Comm; - typedef MPI_Request CommsRequest_t; +typedef MPI_Comm Grid_MPI_Comm; +typedef MPI_Request CommsRequest_t; #else - typedef int CommsRequest_t; - typedef int Grid_MPI_Comm; +typedef int CommsRequest_t; +typedef int Grid_MPI_Comm; #endif class GlobalSharedMemory { - private: +private: static const int MAXLOG2RANKSPERNODE = 16; // Init once lock on the buffer allocation @@ -64,7 +64,7 @@ class GlobalSharedMemory { static int _ShmAlloc; static uint64_t _ShmAllocBytes; - public: +public: static int ShmSetup(void) { return _ShmSetup; } static int ShmAlloc(void) { return _ShmAlloc; } static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; } @@ -104,14 +104,14 @@ class GlobalSharedMemory { ////////////////////////////// class SharedMemory { - private: +private: static const int MAXLOG2RANKSPERNODE = 16; size_t heap_top; size_t heap_bytes; size_t heap_size; - protected: +protected: Grid_MPI_Comm ShmComm; // for barriers int ShmRank; @@ -119,7 +119,7 @@ class SharedMemory std::vector ShmCommBufs; std::vector ShmRanks;// Mapping comm ranks to Shm ranks - public: +public: SharedMemory() {}; /////////////////////////////////////////////////////////////////////////////////////// // set the buffers & 
sizes diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc index b10e6615..3184d071 100644 --- a/lib/communicator/SharedMemoryMPI.cc +++ b/lib/communicator/SharedMemoryMPI.cc @@ -292,10 +292,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) } #endif - //////////////////////////////////////////////////////// - // Global shared functionality finished - // Now move to per communicator functionality - //////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// +// Global shared functionality finished +// Now move to per communicator functionality +//////////////////////////////////////////////////////// void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) { int rank, size; diff --git a/lib/communicator/SharedMemoryNone.cc b/lib/communicator/SharedMemoryNone.cc index efd9e585..928c0332 100644 --- a/lib/communicator/SharedMemoryNone.cc +++ b/lib/communicator/SharedMemoryNone.cc @@ -84,10 +84,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) _ShmAlloc=1; }; - //////////////////////////////////////////////////////// - // Global shared functionality finished - // Now move to per communicator functionality - //////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// +// Global shared functionality finished +// Now move to per communicator functionality +//////////////////////////////////////////////////////// void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) { assert(GlobalSharedMemory::ShmAlloc()==1); From f4272aa6fdedceff55fcd2ddd97bc0dcd9193eb3 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:19:19 +0000 Subject: [PATCH 046/754] Clean up --- lib/allocator/AlignedAllocator.cc | 21 +++-- lib/allocator/AlignedAllocator.h | 151 +++++++++++++++--------------- 2 files changed, 87 insertions(+), 85 deletions(-) diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc index 10b49f4b..2a80dbf9 100644 --- a/lib/allocator/AlignedAllocator.cc +++ b/lib/allocator/AlignedAllocator.cc @@ -1,7 +1,7 @@ #include #include -namespace Grid { +NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; @@ -49,7 +49,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void *PointerCache::Lookup(size_t bytes) { - if (bytes < 4096 ) return NULL; + if (bytes < 4096 ) return NULL; #ifdef _OPENMP assert(omp_in_parallel()==0); @@ -90,7 +90,7 @@ void check_huge_pages(void *Buf,uint64_t BYTES) ++n4ktotal; if (pageaddr != baseaddr + j * page_size) ++nnothuge; - } + } } int rank = CartesianCommunicator::RankWorld(); printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); @@ -106,20 +106,21 @@ std::string sizeString(const size_t bytes) double count = bytes; while (count >= 1024 && s < 7) - { + { s++; count /= 1024; - } + } if (count - floor(count) == 0.0) - { + { snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } + } else - { + { snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } + } return std::string(buf); } -} +NAMESPACE_END(Grid); + diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 3b27aec9..49798d7a 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -24,8 +24,8 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ALIGNED_ALLOCATOR_H #define GRID_ALIGNED_ALLOCATOR_H @@ -40,89 +40,89 @@ Author: Peter Boyle #include #endif -namespace Grid { +NAMESPACE_BEGIN(Grid); - class PointerCache { - private: +class PointerCache { +private: - static const int Ncache=8; - static int victim; + static const int Ncache=8; + static int victim; - typedef struct { - void *address; - size_t bytes; - int valid; - } PointerCacheEntry; + typedef struct { + void *address; + size_t bytes; + int valid; + } PointerCacheEntry; - static PointerCacheEntry Entries[Ncache]; + static PointerCacheEntry Entries[Ncache]; - public: +public: - static void *Insert(void *ptr,size_t bytes) ; - static void *Lookup(size_t bytes) ; + static void *Insert(void *ptr,size_t bytes) ; + static void *Lookup(size_t bytes) ; - }; +}; - std::string sizeString(size_t bytes); +std::string sizeString(size_t bytes); - struct MemoryStats - { - size_t totalAllocated{0}, maxAllocated{0}, - currentlyAllocated{0}, totalFreed{0}; - }; +struct MemoryStats +{ + size_t totalAllocated{0}, maxAllocated{0}, + currentlyAllocated{0}, totalFreed{0}; +}; - class MemoryProfiler - { - public: - static MemoryStats *stats; - static bool debug; - }; +class MemoryProfiler +{ +public: + static MemoryStats *stats; + static bool debug; +}; - #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" - #define profilerDebugPrint \ - if (MemoryProfiler::stats)\ - {\ - auto s = MemoryProfiler::stats;\ - std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\ - std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ - << std::endl;\ - std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ - << std::endl;\ - std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ - << std::endl;\ - std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ - << std::endl;\ - } +#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" +#define profilerDebugPrint \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ + << std::endl; \ + } - #define profilerAllocate(bytes)\ - if (MemoryProfiler::stats)\ - {\ - auto s = MemoryProfiler::stats;\ - s->totalAllocated += (bytes);\ - s->currentlyAllocated += (bytes);\ - s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\ - }\ - if (MemoryProfiler::debug)\ - {\ - std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\ - profilerDebugPrint;\ - } +#define profilerAllocate(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalAllocated += (bytes); \ + 
s->currentlyAllocated += (bytes); \ + s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } - #define profilerFree(bytes)\ - if (MemoryProfiler::stats)\ - {\ - auto s = MemoryProfiler::stats;\ - s->totalFreed += (bytes);\ - s->currentlyAllocated -= (bytes);\ - }\ - if (MemoryProfiler::debug)\ - {\ - std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\ - profilerDebugPrint;\ - } +#define profilerFree(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalFreed += (bytes); \ + s->currentlyAllocated -= (bytes); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } - void check_huge_pages(void *Buf,uint64_t BYTES); +void check_huge_pages(void *Buf,uint64_t BYTES); //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. @@ -159,7 +159,7 @@ public: ////////////////// // Hack 2MB align; could make option probably doesn't need configurability ////////////////// -//define GRID_ALLOC_ALIGN (128) + //define GRID_ALLOC_ALIGN (128) #define GRID_ALLOC_ALIGN (2*1024*1024) #ifdef HAVE_MM_MALLOC_H if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); @@ -205,8 +205,8 @@ template inline bool operator!=(const alignedAllocator<_Tp>&, con #ifdef GRID_COMMS_SHMEM extern "C" { #include -extern void * shmem_align(size_t, size_t); -extern void shmem_free(void *); + extern void * shmem_align(size_t, size_t); + extern void shmem_free(void *); } #define PARANOID_SYMMETRIC_HEAP #endif @@ -276,7 +276,7 @@ public: #endif uint8_t *cp = (uint8_t *)ptr; if ( ptr ) { - // One touch per 4k page, static OMP loop to catch same loop order + // One touch per 4k page, static OMP loop to catch same loop order #pragma omp parallel for schedule(static) for(size_type n=0;n using Vector = std::vector >; template using commVector = std::vector >; template using Matrix = std::vector > >; -}; // namespace Grid +NAMESPACE_END(Grid); + #endif From c037244874e01153a2c1c3bf74afa26d58d0a4a4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 13 Jan 2018 00:31:02 +0000 Subject: [PATCH 047/754] Tensor reformatted with NAMESPACE too --- lib/tensors/Tensor_Ta.h | 175 ++++++----- lib/tensors/Tensor_arith.h | 6 +- lib/tensors/Tensor_arith_add.h | 209 +++++++------ lib/tensors/Tensor_arith_mac.h | 80 ++--- lib/tensors/Tensor_arith_mul.h | 209 +++++++------ lib/tensors/Tensor_arith_scalar.h | 11 +- lib/tensors/Tensor_arith_sub.h | 126 ++++---- lib/tensors/Tensor_class.h | 212 ++++++------- lib/tensors/Tensor_determinant.h | 83 ++--- lib/tensors/Tensor_exp.h | 178 ++++++----- lib/tensors/Tensor_extract_merge.h | 476 ++++++++++++++-------------- lib/tensors/Tensor_index.h | 395 ++++++++++++------------ lib/tensors/Tensor_inner.h | 157 +++++----- lib/tensors/Tensor_logical.h | 49 +-- lib/tensors/Tensor_outer.h | 39 +-- lib/tensors/Tensor_reality.h | 131 ++++---- lib/tensors/Tensor_trace.h | 31 +- lib/tensors/Tensor_traits.h | 480 +++++++++++++++-------------- lib/tensors/Tensor_transpose.h | 86 +++--- lib/tensors/Tensor_unary.h | 121 ++++---- lib/tensors/Tensors.h | 6 +- 21 files changed, 1634 insertions(+), 1626 deletions(-) diff --git a/lib/tensors/Tensor_Ta.h b/lib/tensors/Tensor_Ta.h index 
45a2257f..5cd1d101 100644 --- a/lib/tensors/Tensor_Ta.h +++ b/lib/tensors/Tensor_Ta.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,102 +24,101 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_TA_H #define GRID_MATH_TA_H -namespace Grid { +NAMESPACE_BEGIN(Grid); - /////////////////////////////////////////////// - // Ta function for scalar, vector, matrix - /////////////////////////////////////////////// - /* +/////////////////////////////////////////////// +// Ta function for scalar, vector, matrix +/////////////////////////////////////////////// +/* inline ComplexF Ta( const ComplexF &arg){ return arg;} inline ComplexD Ta( const ComplexD &arg){ return arg;} inline RealF Ta( const RealF &arg){ return arg;} inline RealD Ta( const RealD &arg){ return arg;} - */ - - template inline iScalar Ta(const iScalar&r) - { - iScalar ret; - ret._internal = Ta(r._internal); - return ret; - } - template inline iVector Ta(const iVector&r) - { - iVector ret; - for(int i=0;i inline iMatrix Ta(const iMatrix &arg) - { - iMatrix ret; - - double factor = (1.0/(double)N); - ret= (arg - adj(arg))*0.5; - ret=ret - (trace(ret)*factor); - return ret; - } - - - /////////////////////////////////////////////// - // ProjectOnGroup function for scalar, vector, matrix - // Projects on orthogonal, unitary group - /////////////////////////////////////////////// - - - template inline iScalar ProjectOnGroup(const iScalar&r) - { - iScalar ret; - ret._internal = ProjectOnGroup(r._internal); - return ret; - } - template inline iVector ProjectOnGroup(const iVector&r) - { - iVector ret; - for(int i=0;i::TensorLevel == 0 >::type * =nullptr> - inline iMatrix ProjectOnGroup(const iMatrix &arg) - { - // need a check for the group type? - iMatrix ret(arg); - vtype nrm; - vtype inner; - for(int c1=0;c1 inline iScalar Ta(const iScalar&r) +{ + iScalar ret; + ret._internal = Ta(r._internal); + return ret; } +template inline iVector Ta(const iVector&r) +{ + iVector ret; + for(int i=0;i inline iMatrix Ta(const iMatrix &arg) +{ + iMatrix ret; + + double factor = (1.0/(double)N); + ret= (arg - adj(arg))*0.5; + ret=ret - (trace(ret)*factor); + return ret; +} + + +/////////////////////////////////////////////// +// ProjectOnGroup function for scalar, vector, matrix +// Projects on orthogonal, unitary group +/////////////////////////////////////////////// + + +template inline iScalar ProjectOnGroup(const iScalar&r) +{ + iScalar ret; + ret._internal = ProjectOnGroup(r._internal); + return ret; +} +template inline iVector ProjectOnGroup(const iVector&r) +{ + iVector ret; + for(int i=0;i::TensorLevel == 0 >::type * =nullptr> +inline iMatrix ProjectOnGroup(const iMatrix &arg) +{ + // need a check for the group type? + iMatrix ret(arg); + vtype nrm; + vtype inner; + for(int c1=0;c1 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
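Ta() in the hunk above projects onto the traceless anti-hermitian part: ret = (arg - adj(arg))/2, then trace(ret)/N subtracted from the diagonal. A minimal check of the same arithmetic with bare std::complex arrays standing in for iMatrix (the test values are arbitrary):

#include <complex>
#include <iostream>

int main() {
  using C = std::complex<double>;
  const int N = 2;
  C a[N][N] = {{{1,2},{3,4}}, {{5,6},{7,8}}};   // arbitrary test matrix
  C r[N][N];
  for(int i=0;i<N;i++) for(int j=0;j<N;j++)     // (arg - adj(arg)) * 0.5
    r[i][j] = 0.5*(a[i][j] - std::conj(a[j][i]));
  C tr = 0;                                     // ret = ret - trace(ret)/N
  for(int i=0;i<N;i++) tr += r[i][i];
  for(int i=0;i<N;i++) r[i][i] -= tr/double(N);
  C check = 0;
  for(int i=0;i<N;i++) check += r[i][i];
  std::cout << "trace after Ta: " << check << "\n";  // ~(0,0): traceless
}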
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_H #define GRID_MATH_ARITH_H diff --git a/lib/tensors/Tensor_arith_add.h b/lib/tensors/Tensor_arith_add.h index 6e01fb19..62dd188e 100644 --- a/lib/tensors/Tensor_arith_add.h +++ b/lib/tensors/Tensor_arith_add.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,123 +24,122 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_ADD_H #define GRID_MATH_ARITH_ADD_H -namespace Grid { +NAMESPACE_BEGIN(Grid); - /////////////////////////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////// ADD /////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////// ADD /////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// // ADD is simple for now; cannot mix types and straightforward template // Scalar +/- Scalar // Vector +/- Vector // Matrix +/- Matrix - template strong_inline void add(iScalar * __restrict__ ret, - const iScalar * __restrict__ lhs, - const iScalar * __restrict__ rhs) - { - add(&ret->_internal,&lhs->_internal,&rhs->_internal); - } - template strong_inline void add(iVector * __restrict__ ret, - const iVector * __restrict__ lhs, - const iVector * __restrict__ rhs) - { - for(int c=0;c_internal[c]=lhs->_internal[c]+rhs->_internal[c]; - } - return; +template strong_inline void add(iScalar * __restrict__ ret, + const iScalar * __restrict__ lhs, + const iScalar * __restrict__ rhs) +{ + add(&ret->_internal,&lhs->_internal,&rhs->_internal); +} +template strong_inline void add(iVector * __restrict__ ret, + const iVector * __restrict__ lhs, + const iVector * __restrict__ rhs) +{ + for(int c=0;c_internal[c]=lhs->_internal[c]+rhs->_internal[c]; } + return; +} - template strong_inline void add(iMatrix * __restrict__ ret, - const iMatrix * __restrict__ lhs, - const iMatrix * __restrict__ rhs) - { - for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); - }} - return; - } - template strong_inline void add(iMatrix * __restrict__ ret, - const iScalar * __restrict__ lhs, - const iMatrix * __restrict__ rhs) - { - for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); - else - ret->_internal[c1][c2]=lhs->_internal[c1][c2]; - }} - return; - } - template strong_inline void add(iMatrix * __restrict__ ret, - const iMatrix * __restrict__ lhs, - const iScalar * __restrict__ rhs) - { - for(int 
c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); - else - ret->_internal[c1][c2]=lhs->_internal[c1][c2]; - }} - return; - } - - - // + operator for scalar, vector, matrix - template - //strong_inline auto operator + (iScalar& lhs,iScalar&& rhs) -> iScalar - strong_inline auto operator + (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef iScalar ret_t; - ret_t ret; - add(&ret,&lhs,&rhs); - return ret; - } - template - strong_inline auto operator + (const iVector& lhs,const iVector& rhs) ->iVector - { - typedef iVector ret_t; - ret_t ret; - add(&ret,&lhs,&rhs); - return ret; - } - template - strong_inline auto operator + (const iMatrix& lhs,const iMatrix& rhs) ->iMatrix - { - typedef iMatrix ret_t; - ret_t ret; - add(&ret,&lhs,&rhs); - return ret; - } - template -strong_inline auto operator + (const iScalar& lhs,const iMatrix& rhs)->iMatrix - { - typedef iMatrix ret_t; - ret_t ret; - add(&ret,&lhs,&rhs); - return ret; - } - - template - strong_inline auto operator + (const iMatrix& lhs,const iScalar& rhs)->iMatrix - { - typedef iMatrix ret_t; - ret_t ret; - add(&ret,&lhs,&rhs); - return ret; - } - - - +template strong_inline void add(iMatrix * __restrict__ ret, + const iMatrix * __restrict__ lhs, + const iMatrix * __restrict__ rhs) +{ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); + }} + return; +} +template strong_inline void add(iMatrix * __restrict__ ret, + const iScalar * __restrict__ lhs, + const iMatrix * __restrict__ rhs) +{ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); + else + ret->_internal[c1][c2]=lhs->_internal[c1][c2]; + }} + return; +} +template strong_inline void add(iMatrix * __restrict__ ret, + const iMatrix * __restrict__ lhs, + const iScalar * __restrict__ rhs) +{ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); + else + ret->_internal[c1][c2]=lhs->_internal[c1][c2]; + }} + return; } + +// + operator for scalar, vector, matrix +template +//strong_inline auto operator + (iScalar& lhs,iScalar&& rhs) -> iScalar +strong_inline auto operator + (const iScalar& lhs,const iScalar& rhs) -> iScalar +{ + typedef iScalar ret_t; + ret_t ret; + add(&ret,&lhs,&rhs); + return ret; +} +template +strong_inline auto operator + (const iVector& lhs,const iVector& rhs) ->iVector +{ + typedef iVector ret_t; + ret_t ret; + add(&ret,&lhs,&rhs); + return ret; +} +template +strong_inline auto operator + (const iMatrix& lhs,const iMatrix& rhs) ->iMatrix +{ + typedef iMatrix ret_t; + ret_t ret; + add(&ret,&lhs,&rhs); + return ret; +} +template +strong_inline auto operator + (const iScalar& lhs,const iMatrix& rhs)->iMatrix +{ + typedef iMatrix ret_t; + ret_t ret; + add(&ret,&lhs,&rhs); + return ret; +} + +template +strong_inline auto operator + (const iMatrix& lhs,const iScalar& rhs)->iMatrix +{ + typedef iMatrix ret_t; + ret_t ret; + add(&ret,&lhs,&rhs); + return ret; +} + +NAMESPACE_END(Grid); + + #endif diff --git a/lib/tensors/Tensor_arith_mac.h b/lib/tensors/Tensor_arith_mac.h index c053224b..e05a7e3a 100644 --- a/lib/tensors/Tensor_arith_mac.h +++ b/lib/tensors/Tensor_arith_mac.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,86 +23,86 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
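The layout of Tensor_arith_add.h follows one fixed pattern: a low-level add() kernel that writes through __restrict__ pointers, plus a thin operator+ wrapper that names the return type and forwards. A toy reproduction of that split (Vec3 is an invented stand-in, not the Grid class):

#include <iostream>

struct Vec3 { double v[3]; };

inline void add(Vec3 * __restrict__ ret,              // kernel: fills *ret
                const Vec3 * __restrict__ lhs,
                const Vec3 * __restrict__ rhs) {
  for(int c=0;c<3;c++) ret->v[c] = lhs->v[c] + rhs->v[c];
}
inline Vec3 operator+(const Vec3 &lhs, const Vec3 &rhs) {
  Vec3 ret;                                           // wrapper owns the result
  add(&ret,&lhs,&rhs);
  return ret;
}

int main() {
  Vec3 a{{1,2,3}}, b{{10,20,30}};
  Vec3 c = a + b;
  std::cout << c.v[0] << " " << c.v[1] << " " << c.v[2] << "\n";  // 11 22 33
}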
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_MAC_H #define GRID_MATH_ARITH_MAC_H -namespace Grid { - +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////// MAC /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// - /////////////////////////// +/////////////////////////// - /////////////////////////// - // Legal multiplication table - /////////////////////////// - // scal x scal = scal - // mat x mat = mat - // mat x scal = mat - // scal x mat = mat - // mat x vec = vec - // vec x scal = vec - // scal x vec = vec - /////////////////////////// +/////////////////////////// +// Legal multiplication table +/////////////////////////// +// scal x scal = scal +// mat x mat = mat +// mat x scal = mat +// scal x mat = mat +// mat x vec = vec +// vec x scal = vec +// scal x vec = vec +/////////////////////////// template strong_inline void mac(iScalar * __restrict__ ret,const iScalar * __restrict__ lhs,const iScalar * __restrict__ rhs) { - mac(&ret->_internal,&lhs->_internal,&rhs->_internal); + mac(&ret->_internal,&lhs->_internal,&rhs->_internal); } template strong_inline void mac(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iMatrix * __restrict__ rhs){ - for(int c3=0;c3_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); - }}} - return; + }}} + return; } template strong_inline void mac(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iScalar * __restrict__ rhs){ - for(int c1=0;c1_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); + mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); }} - return; + return; } template strong_inline void mac(iMatrix * __restrict__ ret,const iScalar * __restrict__ lhs,const iMatrix * __restrict__ rhs){ - for(int c1=0;c1_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); + mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); }} - return; + return; } template strong_inline void mac(iVector * __restrict__ ret,const iMatrix * __restrict__ lhs,const iVector * __restrict__ rhs) { - for(int c1=0;c1_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); + mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); }} - return; + return; } template strong_inline void mac(iVector * __restrict__ ret,const iScalar * __restrict__ lhs,const iVector * __restrict__ rhs) { - for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); - } - return; + for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); + } + return; } template strong_inline void mac(iVector * __restrict__ ret,const iVector * __restrict__ lhs,const iScalar * __restrict__ rhs) { - for(int c1=0;c1_internal[c1],&lhs->_internal[c1],&rhs->_internal); - } - return; -} + for(int c1=0;c1_internal[c1],&lhs->_internal[c1],&rhs->_internal); + } + return; } +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_arith_mul.h b/lib/tensors/Tensor_arith_mul.h index a474db9c..7d0dfdb7 100644 --- a/lib/tensors/Tensor_arith_mul.h +++ b/lib/tensors/Tensor_arith_mul.h @@ -1,4 +1,4 @@ - 
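mac() above is multiply-accumulate, ret += lhs*rhs; for matrix times matrix it contracts over the middle index c3, which the reformatted loops keep outermost. A free-standing 2x2 version of that innermost contraction (plain doubles, illustrative only):

#include <iostream>

int main() {
  const int N = 2;
  double lhs[N][N] = {{1,2},{3,4}};
  double rhs[N][N] = {{5,6},{7,8}};
  double ret[N][N] = {{0,0},{0,0}};              // accumulator starts at zero
  for(int c3=0;c3<N;c3++)                        // contraction index outermost
    for(int c1=0;c1<N;c1++)
      for(int c2=0;c2<N;c2++)
        ret[c1][c2] += lhs[c1][c3]*rhs[c3][c2];  // the scalar-level mac
  std::cout << ret[0][0] << " " << ret[0][1] << " "
            << ret[1][0] << " " << ret[1][1] << "\n";  // 19 22 43 50
}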
/************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,21 +23,20 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_MUL_H #define GRID_MATH_ARITH_MUL_H -namespace Grid { +NAMESPACE_BEGIN(Grid); - - /////////////////////////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////// MUL /////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////// MUL /////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// template strong_inline void mult(iScalar * __restrict__ ret,const iScalar * __restrict__ lhs,const iScalar * __restrict__ rhs){ - mult(&ret->_internal,&lhs->_internal,&rhs->_internal); + mult(&ret->_internal,&lhs->_internal,&rhs->_internal); } template @@ -59,48 +58,48 @@ strong_inline void mult(iMatrix * __restrict__ ret,const iMatrix strong_inline void mult(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iScalar * __restrict__ rhs){ - for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); + mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); }} - return; + return; } template strong_inline void mult(iMatrix * __restrict__ ret,const iScalar * __restrict__ lhs,const iMatrix * __restrict__ rhs){ - for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); + mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); }} - return; + return; } // Matrix left multiplies vector template strong_inline void mult(iVector * __restrict__ ret,const iMatrix * __restrict__ lhs,const iVector * __restrict__ rhs) { - for(int c1=0;c1_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]); - for(int c2=1;c2_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); - } + for(int c1=0;c1_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]); + for(int c2=1;c2_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]); } - return; + } + return; } template strong_inline void mult(iVector * __restrict__ ret, - const iScalar * __restrict__ lhs, - const iVector * __restrict__ rhs){ - for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); - } + const iScalar * __restrict__ lhs, + const iVector * __restrict__ rhs){ + for(int c1=0;c1_internal[c1],&lhs->_internal,&rhs->_internal[c1]); + } } template strong_inline void mult(iVector * __restrict__ ret, - const iVector * __restrict__ rhs, - const iScalar * __restrict__ lhs){ - for(int c1=0;c1_internal[c1],&rhs->_internal[c1],&lhs->_internal); - } + const iVector * __restrict__ rhs, + const iScalar * __restrict__ lhs){ + for(int c1=0;c1_internal[c1],&rhs->_internal[c1],&lhs->_internal); + } } @@ -108,25 +107,25 @@ strong_inline void mult(iVector * __restrict__ ret, template 
strong_inline iVector operator * (const iMatrix& lhs,const iVector& rhs) { - iVector ret; - mult(&ret,&lhs,&rhs); - return ret; + iVector ret; + mult(&ret,&lhs,&rhs); + return ret; } template strong_inline iVector operator * (const iScalar& lhs,const iVector& rhs) { - iVector ret; - mult(&ret,&lhs,&rhs); - return ret; + iVector ret; + mult(&ret,&lhs,&rhs); + return ret; } template strong_inline iVector operator * (const iVector& lhs,const iScalar& rhs) { - iVector ret; - mult(&ret,&lhs,&rhs); - return ret; + iVector ret; + mult(&ret,&lhs,&rhs); + return ret; } ////////////////////////////////////////////////////////////////// @@ -135,119 +134,119 @@ iVector operator * (const iVector& lhs,const iScalar& r template strong_inline iScalar operator / (const iScalar& lhs,const iScalar& rhs) { - iScalar ret; - ret._internal = lhs._internal/rhs._internal; - return ret; + iScalar ret; + ret._internal = lhs._internal/rhs._internal; + return ret; } template strong_inline iVector operator / (const iVector& lhs,const iScalar& rhs) { - iVector ret; - for(int i=0;i ret; + for(int i=0;i strong_inline iMatrix operator / (const iMatrix& lhs,const iScalar& rhs) { - iMatrix ret; - for(int i=0;i ret; + for(int i=0;i x matrix-> matrix - // while matrix x matrix-> matrix - // so return type depends on argument types in nasty way. - ////////////////////////////////////////////////////////////////// - // scal x scal = scal - // mat x mat = mat - // mat x scal = mat - // scal x mat = mat - // mat x vec = vec - // vec x scal = vec - // scal x vec = vec - // - // We can special case scalar_type ?? +////////////////////////////////////////////////////////////////// +// Glue operators to mult routines. Must resolve return type cleverly from typeof(internal) +// since nesting matrix x matrix-> matrix +// while matrix x matrix-> matrix +// so return type depends on argument types in nasty way. +////////////////////////////////////////////////////////////////// +// scal x scal = scal +// mat x mat = mat +// mat x scal = mat +// scal x mat = mat +// mat x vec = vec +// vec x scal = vec +// scal x vec = vec +// +// We can special case scalar_type ?? 
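The deduction trick this comment block describes is the heart of the operator glue: the element type of the result is taken via decltype from a product of innermost elements, so scalar x vector returns a vector of whatever elem*elem yields. A compilable toy of the same idiom (Sca/Vec are invented stand-ins for the Grid templates):

#include <complex>
#include <type_traits>

template<class vtype>        struct Sca { vtype _internal; };
template<class vtype, int N> struct Vec { vtype _internal[N]; };

template<class l, class r, int N>
auto operator*(const Sca<l> &lhs, const Vec<r,N> &rhs)
  -> Vec<decltype(lhs._internal*rhs._internal[0]), N>   // element type deduced
{
  Vec<decltype(lhs._internal*rhs._internal[0]), N> ret;
  for(int c=0;c<N;c++) ret._internal[c] = lhs._internal*rhs._internal[c];
  return ret;
}

int main() {
  Sca<double> s{2.0};
  Vec<std::complex<double>,3> v{{{1,1},{2,2},{3,3}}};
  auto w = s*v;   // double * complex<double> -> complex<double> element
  static_assert(std::is_same<decltype(w), Vec<std::complex<double>,3>>::value,
                "return type follows the innermost product");
}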
template strong_inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { - typedef iScalar ret_t; - ret_t ret; - mult(&ret,&lhs,&rhs); - return ret; + typedef iScalar ret_t; + ret_t ret; + mult(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator * (const iMatrix& lhs,const iMatrix& rhs) -> iMatrix { - typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t; - iMatrix ret; - mult(&ret,&lhs,&rhs); - return ret; + typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t; + iMatrix ret; + mult(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator * (const iMatrix& lhs,const iScalar& rhs) -> iMatrix { - typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t; + typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t; - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 strong_inline auto operator * (const iScalar& lhs,const iMatrix& rhs) -> iMatrix { - typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t; - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 strong_inline auto operator * (const iMatrix& lhs,const iVector& rhs) -> iVector { - typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t; - iVector ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 strong_inline auto operator * (const iScalar& lhs,const iVector& rhs) -> iVector { - typedef decltype(lhs._internal*rhs._internal[0]) ret_t; - iVector ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 strong_inline auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector { - typedef decltype(lhs._internal[0]*rhs._internal) ret_t; - iVector ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_SCALAR_H #define GRID_MATH_ARITH_SCALAR_H -namespace Grid { - +NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////// // Must support native C++ types Integer, Complex, Real @@ -283,6 +282,6 @@ template strong_inline iMatrix operator - (Integer lhs,const return slhs-rhs; } +NAMESPACE_END(Grid); -} #endif diff --git a/lib/tensors/Tensor_arith_sub.h b/lib/tensors/Tensor_arith_sub.h index 8646bdfb..b7593d4d 100644 --- a/lib/tensors/Tensor_arith_sub.h +++ b/lib/tensors/Tensor_arith_sub.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,17 +24,16 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_ARITH_SUB_H #define GRID_MATH_ARITH_SUB_H -namespace Grid { +NAMESPACE_BEGIN(Grid); - - /////////////////////////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////// SUB /////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////// SUB /////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// // SUB is simple for now; cannot mix types and straightforward template @@ -43,102 +42,101 @@ namespace Grid { // Matrix +/- Matrix // Matrix /- scalar template strong_inline void sub(iScalar * __restrict__ ret, - const iScalar * __restrict__ lhs, - const iScalar * __restrict__ rhs) + const iScalar * __restrict__ lhs, + const iScalar * __restrict__ rhs) { - sub(&ret->_internal,&lhs->_internal,&rhs->_internal); + sub(&ret->_internal,&lhs->_internal,&rhs->_internal); } template strong_inline void sub(iVector * __restrict__ ret, - const iVector * __restrict__ lhs, - const iVector * __restrict__ rhs) + const iVector * __restrict__ lhs, + const iVector * __restrict__ rhs) { - for(int c=0;c_internal[c]=lhs->_internal[c]-rhs->_internal[c]; - } - return; + for(int c=0;c_internal[c]=lhs->_internal[c]-rhs->_internal[c]; + } + return; } template strong_inline void sub(iMatrix * __restrict__ ret, - const iMatrix * __restrict__ lhs, - const iMatrix * __restrict__ rhs){ - for(int c2=0;c2 * __restrict__ lhs, + const iMatrix * __restrict__ rhs){ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); + sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]); }} - return; + return; } template strong_inline void sub(iMatrix * __restrict__ ret, - const iScalar * __restrict__ lhs, - const iMatrix * __restrict__ rhs){ - for(int c2=0;c2 * __restrict__ lhs, + const iMatrix * __restrict__ rhs){ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); - } else { - // Fails -- need unary minus. Catalogue other unops? - ret->_internal[c1][c2]=zero; - ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2]; + if ( c1==c2) { + sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]); + } else { + // Fails -- need unary minus. Catalogue other unops? 
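The "need unary minus" remark above explains the zero-then-subtract idiom in the hunk: for scalar minus matrix the off-diagonal entries should be -rhs, but with no catalogued unary minus on the nested type the code assigns zero and subtracts instead. The same logic on bare doubles (sizes and values illustrative):

#include <iostream>

int main() {
  const int N = 2;
  double s = 10.0;                         // the iScalar operand
  double rhs[N][N] = {{1,2},{3,4}};
  double ret[N][N];
  for(int c2=0;c2<N;c2++)
    for(int c1=0;c1<N;c1++){
      if (c1==c2) ret[c1][c2] = s - rhs[c1][c2];   // diagonal: scalar enters
      else {                                       // off-diagonal: want -rhs
        ret[c1][c2] = 0.0;                         // _internal = zero;
        ret[c1][c2] = ret[c1][c2] - rhs[c1][c2];   // ret = ret - rhs
      }
    }
  std::cout << ret[0][0] << " " << ret[0][1] << " "
            << ret[1][0] << " " << ret[1][1] << "\n";  // 9 -2 -3 6
}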
+ ret->_internal[c1][c2]=zero; + ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2]; - } + } }} - return; + return; } template strong_inline void sub(iMatrix * __restrict__ ret, - const iMatrix * __restrict__ lhs, - const iScalar * __restrict__ rhs){ - for(int c2=0;c2 * __restrict__ lhs, + const iScalar * __restrict__ rhs){ + for(int c2=0;c2_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); - else - ret->_internal[c1][c2]=lhs->_internal[c1][c2]; + if ( c1==c2) + sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal); + else + ret->_internal[c1][c2]=lhs->_internal[c1][c2]; }} - return; + return; } - // - operator for scalar, vector, matrix +// - operator for scalar, vector, matrix template strong_inline auto operator - (const iScalar& lhs, const iScalar& rhs) -> iScalar { - typedef iScalar ret_t; - ret_t ret; - sub(&ret,&lhs,&rhs); - return ret; + typedef iScalar ret_t; + ret_t ret; + sub(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator - (const iVector& lhs,const iVector& rhs) ->iVector { - typedef iVector ret_t; - ret_t ret; - sub(&ret,&lhs,&rhs); - return ret; + typedef iVector ret_t; + ret_t ret; + sub(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator - (const iMatrix& lhs,const iMatrix& rhs) ->iMatrix { - typedef iMatrix ret_t; - ret_t ret; - sub(&ret,&lhs,&rhs); - return ret; + typedef iMatrix ret_t; + ret_t ret; + sub(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator - (const iScalar& lhs,const iMatrix& rhs)->iMatrix { - typedef iMatrix ret_t; - ret_t ret; - sub(&ret,&lhs,&rhs); - return ret; + typedef iMatrix ret_t; + ret_t ret; + sub(&ret,&lhs,&rhs); + return ret; } template strong_inline auto operator - (const iMatrix& lhs,const iScalar& rhs)->iMatrix { - typedef iMatrix ret_t; - ret_t ret; - sub(&ret,&lhs,&rhs); - return ret; + typedef iMatrix ret_t; + ret_t ret; + sub(&ret,&lhs,&rhs); + return ret; } - -} +NAMESPACE_END(Grid); #endif diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index c7f868db..27e3622c 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -20,11 +20,11 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef GRID_MATH_TENSORS_H #define GRID_MATH_TENSORS_H -namespace Grid { +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. 
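The three class templates this file defines compose: each holds a vtype _internal which may itself be an iScalar, iVector, or iMatrix, so deep types such as a colour matrix inside a spin vector are built purely by nesting. A toy sketch of that composition (Sca/Vec/Mat and the type aliases are invented stand-ins, not the Grid classes):

#include <complex>
#include <iostream>

template<class vtype>        struct Sca { vtype _internal; };
template<class vtype, int N> struct Vec { vtype _internal[N]; };
template<class vtype, int N> struct Mat { vtype _internal[N][N]; };

// e.g. a colour matrix inside a spin vector inside a site scalar, the shape
// spin-colour objects take when assembled from templates like these:
using Complex      = std::complex<double>;
using ColourMatrix = Mat<Complex,3>;
using SpinVector   = Vec<ColourMatrix,4>;
using SiteObject   = Sca<SpinVector>;

int main() {
  SiteObject s{};                                           // zero-initialised nest
  s._internal._internal[0]._internal[1][2] = Complex(1,0);  // spin 0, colour (1,2)
  std::cout << s._internal._internal[0]._internal[1][2] << "\n";
}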
@@ -44,7 +44,7 @@ class GridTensorBase {}; template class iScalar { - public: +public: vtype _internal; typedef vtype element; @@ -69,10 +69,10 @@ class iScalar { // iScalar::tensor_reduce_level >; iScalar() = default; /* - iScalar(const iScalar ©me)=default; - iScalar(iScalar &©me)=default; - iScalar & operator= (const iScalar ©me) = default; - iScalar & operator= (iScalar &©me) = default; + iScalar(const iScalar ©me)=default; + iScalar(iScalar &©me)=default; + iScalar & operator= (const iScalar ©me) = default; + iScalar & operator= (iScalar &©me) = default; */ // template @@ -109,7 +109,7 @@ class iScalar { friend strong_inline void exchange(iScalar &out1,iScalar &out2, const iScalar &in1,const iScalar &in2,int type){ exchange(out1._internal,out2._internal, - in1._internal, in2._internal,type); + in1._internal, in2._internal,type); } // Unary negation @@ -185,13 +185,13 @@ TensorRemove(T arg) { } template strong_inline auto TensorRemove(iScalar arg) - -> decltype(TensorRemove(arg._internal)) { + -> decltype(TensorRemove(arg._internal)) { return TensorRemove(arg._internal); } template class iVector { - public: +public: vtype _internal[N]; typedef vtype element; @@ -211,7 +211,7 @@ class iVector { typedef iVector::DoublePrecision, N> DoublePrecision; template ::value, T>::type - * = nullptr> + * = nullptr> strong_inline auto operator=(T arg) -> iVector { zeroit(*this); for (int i = 0; i < N; i++) _internal[i] = arg; @@ -222,10 +222,10 @@ class iVector { iVector(const Zero &z) { *this = zero; }; iVector() = default; /* - iVector(const iVector ©me)=default; - iVector(iVector &©me)=default; - iVector & operator= (const iVector ©me) = default; - iVector & operator= (iVector &©me) = default; + iVector(const iVector ©me)=default; + iVector(iVector &©me)=default; + iVector & operator= (const iVector ©me) = default; + iVector & operator= (iVector &©me) = default; */ iVector &operator=(const Zero &hero) { @@ -265,7 +265,7 @@ class iVector { const iVector &in1,const iVector &in2,int type){ for(int i=0;i class iMatrix { - public: +public: vtype _internal[N][N]; typedef vtype element; @@ -344,10 +344,10 @@ class iMatrix { }; // recurse down and hit the constructor for vector_type /* - iMatrix(const iMatrix ©me)=default; - iMatrix(iMatrix &©me)=default; - iMatrix & operator= (const iMatrix ©me) = default; - iMatrix & operator= (iMatrix &©me) = default; + iMatrix(const iMatrix ©me)=default; + iMatrix(iMatrix &©me)=default; + iMatrix & operator= (const iMatrix ©me) = default; + iMatrix & operator= (iMatrix &©me) = default; */ iMatrix &operator=(const Zero &hero) { @@ -355,109 +355,109 @@ class iMatrix { return *this; } template ::value, T>::type - * = nullptr> + * = nullptr> strong_inline auto operator=(T arg) -> iMatrix { zeroit(*this); for (int i = 0; i < N; i++) _internal[i][i] = arg; return *this; } - friend strong_inline void zeroit(iMatrix &that){ - for(int i=0;i &that){ + for(int i=0;i &that){ - for(int i=0;i &that){ + for(int i=0;i &out,const iMatrix &in){ - for(int i=0;i &out,const iMatrix &in,int lane){ - for(int i=0;i &out,const iMatrix &in){ + for(int i=0;i &out,const iMatrix &in,int lane){ + for(int i=0;i &out,const iMatrix &in,int permutetype){ - for(int i=0;i &out,const iMatrix &in,int permutetype){ + for(int i=0;i &out,const iMatrix &in,int rot){ - for(int i=0;i &out,const iMatrix &in,int rot){ + for(int i=0;i &out1,iMatrix &out2, - const iMatrix &in1,const iMatrix &in2,int type){ - for(int i=0;i &out1,iMatrix &out2, + const iMatrix &in1,const iMatrix &in2,int type){ + for(int i=0;i 
operator-(const iMatrix &r) { - iMatrix ret; - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - ret._internal[i][j] = -r._internal[i][j]; - } +// Unary negation +friend strong_inline iMatrix operator-(const iMatrix &r) { + iMatrix ret; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + ret._internal[i][j] = -r._internal[i][j]; } - return ret; - } - // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour - template - strong_inline iMatrix &operator*=(const T &r) { - *this = (*this) * r; - return *this; - } - template - strong_inline iMatrix &operator-=(const T &r) { - *this = (*this) - r; - return *this; - } - template - strong_inline iMatrix &operator+=(const T &r) { - *this = (*this) + r; - return *this; } + return ret; +} +// *=,+=,-= operators inherit from corresponding "*,-,+" behaviour +template +strong_inline iMatrix &operator*=(const T &r) { + *this = (*this) * r; + return *this; +} +template +strong_inline iMatrix &operator-=(const T &r) { + *this = (*this) - r; + return *this; +} +template +strong_inline iMatrix &operator+=(const T &r) { + *this = (*this) + r; + return *this; +} - // returns an lvalue reference - strong_inline vtype &operator()(int i, int j) { return _internal[i][j]; } - strong_inline const vtype &operator()(int i, int j) const { - return _internal[i][j]; - } - friend std::ostream &operator<<(std::ostream &stream, - const iMatrix &o) { - stream << "M<" << N << ">{"; - for (int i = 0; i < N; i++) { - stream << "{"; - for (int j = 0; j < N; j++) { - stream << o._internal[i][j]; - if (i < N - 1) stream << ","; - } - stream << "}"; - if (i != N - 1) stream << "\n\t\t"; +// returns an lvalue reference +strong_inline vtype &operator()(int i, int j) { return _internal[i][j]; } +strong_inline const vtype &operator()(int i, int j) const { + return _internal[i][j]; +} +friend std::ostream &operator<<(std::ostream &stream, + const iMatrix &o) { + stream << "M<" << N << ">{"; + for (int i = 0; i < N; i++) { + stream << "{"; + for (int j = 0; j < N; j++) { + stream << o._internal[i][j]; + if (i < N - 1) stream << ","; } stream << "}"; - return stream; - }; + if (i != N - 1) stream << "\n\t\t"; + } + stream << "}"; + return stream; +}; - // strong_inline vtype && operator ()(int i,int j) { - // return _internal[i][j]; - // } +// strong_inline vtype && operator ()(int i,int j) { +// return _internal[i][j]; +// } }; template @@ -478,7 +478,9 @@ void vprefetch(const iMatrix &vv) { } } } -} + +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_determinant.h b/lib/tensors/Tensor_determinant.h index a58a8074..c81f19f8 100644 --- a/lib/tensors/Tensor_determinant.h +++ b/lib/tensors/Tensor_determinant.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,50 +23,51 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
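For reference, the operator<< above renders an iMatrix as M<N>{{...},{...}} with rows separated by a newline plus tabs. A free-standing reproduction of the format on a plain 2x2 array (note the comma test in the hunk reads i < N - 1; the sketch gates it on j, which gives the per-row commas the layout suggests):

#include <iostream>

int main() {
  const int N = 2;
  double o[N][N] = {{1,2},{3,4}};
  std::cout << "M<" << N << ">{";
  for (int i = 0; i < N; i++) {
    std::cout << "{";
    for (int j = 0; j < N; j++) {
      std::cout << o[i][j];
      if (j < N - 1) std::cout << ",";   // separator within a row
    }
    std::cout << "}";
    if (i != N - 1) std::cout << "\n\t\t";
  }
  std::cout << "}\n";                    // M<2>{{1,2}
                                         //		{3,4}}
}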
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_DET_H #define GRID_MATH_DET_H -namespace Grid { - /////////////////////////////////////////////// - // Determinant function for scalar, vector, matrix - /////////////////////////////////////////////// - inline ComplexF Determinant( const ComplexF &arg){ return arg;} - inline ComplexD Determinant( const ComplexD &arg){ return arg;} - inline RealF Determinant( const RealF &arg){ return arg;} - inline RealD Determinant( const RealD &arg){ return arg;} - - template inline auto Determinant(const iScalar&r) -> iScalar - { - iScalar ret; - ret._internal = Determinant(r._internal); - return ret; - } - - template::TensorLevel == 0 >::type * =nullptr> - inline iScalar Determinant(const iMatrix &arg) - { - iMatrix ret(arg); - iScalar det = vtype(1.0); - /* Conversion of matrix to upper triangular */ - for(int i = 0; i < N; i++){ - for(int j = 0; j < N; j++){ - if(j>i){ - vtype ratio = ret._internal[j][i]/ret._internal[i][i]; - for(int k = 0; k < N; k++){ - ret._internal[j][k] -= ratio * ret._internal[i][k]; - } - } - } - } - - for(int i = 0; i < N; i++) - det *= ret._internal[i][i]; - - return det; - } +NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////// +// Determinant function for scalar, vector, matrix +/////////////////////////////////////////////// +inline ComplexF Determinant( const ComplexF &arg){ return arg;} +inline ComplexD Determinant( const ComplexD &arg){ return arg;} +inline RealF Determinant( const RealF &arg){ return arg;} +inline RealD Determinant( const RealD &arg){ return arg;} +template inline auto Determinant(const iScalar&r) -> iScalar +{ + iScalar ret; + ret._internal = Determinant(r._internal); + return ret; } + +template::TensorLevel == 0 >::type * =nullptr> +inline iScalar Determinant(const iMatrix &arg) +{ + iMatrix ret(arg); + iScalar det = vtype(1.0); + /* Conversion of matrix to upper triangular */ + for(int i = 0; i < N; i++){ + for(int j = 0; j < N; j++){ + if(j>i){ + vtype ratio = ret._internal[j][i]/ret._internal[i][i]; + for(int k = 0; k < N; k++){ + ret._internal[j][k] -= ratio * ret._internal[i][k]; + } + } + } + } + + for(int i = 0; i < N; i++) + det *= ret._internal[i][i]; + + return det; +} + +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_exp.h b/lib/tensors/Tensor_exp.h index f7eee8f0..76d659b7 100644 --- a/lib/tensors/Tensor_exp.h +++ b/lib/tensors/Tensor_exp.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,122 +23,120 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
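Determinant() above is plain Gaussian elimination to upper-triangular form followed by a product of the diagonal; note it divides by the pivot ret._internal[i][i] with no row pivoting, so a zero pivot is not handled. The same algorithm free-standing on a 3x3 double matrix (values chosen so det = -1):

#include <iostream>

int main() {
  const int N = 3;
  double m[N][N] = {{2,1,1},{1,3,2},{1,0,0}};
  for(int i = 0; i < N; i++)             // conversion to upper triangular
    for(int j = 0; j < N; j++)
      if(j>i){
        double ratio = m[j][i]/m[i][i];  // assumes a non-zero pivot
        for(int k = 0; k < N; k++) m[j][k] -= ratio*m[i][k];
      }
  double det = 1.0;
  for(int i = 0; i < N; i++) det *= m[i][i];
  std::cout << "det = " << det << "\n";  // -1 for this matrix
}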
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_EXP_H #define GRID_MATH_EXP_H #define DEFAULT_MAT_EXP 12 -namespace Grid { +NAMESPACE_BEGIN(Grid); - /////////////////////////////////////////////// - // Exponentiate function for scalar, vector, matrix - /////////////////////////////////////////////// +/////////////////////////////////////////////// +// Exponentiate function for scalar, vector, matrix +/////////////////////////////////////////////// - template inline iScalar Exponentiate(const iScalar&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) - { - iScalar ret; - ret._internal = Exponentiate(r._internal, alpha, Nexp); - return ret; - } +template inline iScalar Exponentiate(const iScalar&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) +{ + iScalar ret; + ret._internal = Exponentiate(r._internal, alpha, Nexp); + return ret; +} template inline iVector Exponentiate(const iVector&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) - { - iVector ret; - for (int i = 0; i < N; i++) - ret._internal[i] = Exponentiate(r._internal[i], alpha, Nexp); - return ret; - } +{ + iVector ret; + for (int i = 0; i < N; i++) + ret._internal[i] = Exponentiate(r._internal[i], alpha, Nexp); + return ret; +} - // Specialisation: Cayley-Hamilton exponential for SU(3) - template::TensorLevel == 0>::type * =nullptr> - inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) - { - // for SU(3) 2x faster than the std implementation using Nexp=12 - // notice that it actually computes - // exp ( input matrix ) - // the i sign is coming from outside - // input matrix is anti-hermitian NOT hermitian - typedef iMatrix mat; - typedef iScalar scalar; - mat unit(1.0); - mat temp(unit); - const Complex one_over_three = 1.0 / 3.0; - const Complex one_over_two = 1.0 / 2.0; +// Specialisation: Cayley-Hamilton exponential for SU(3) +template::TensorLevel == 0>::type * =nullptr> +inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) +{ + // for SU(3) 2x faster than the std implementation using Nexp=12 + // notice that it actually computes + // exp ( input matrix ) + // the i sign is coming from outside + // input matrix is anti-hermitian NOT hermitian + typedef iMatrix mat; + typedef iScalar scalar; + mat unit(1.0); + mat temp(unit); + const Complex one_over_three = 1.0 / 3.0; + const Complex one_over_two = 1.0 / 2.0; - scalar c0, c1, tmp, c0max, theta, u, w; - scalar xi0, u2, w2, cosw; - scalar fden, h0, h1, h2; - scalar e2iu, emiu, ixi0, qt; - scalar f0, f1, f2; - scalar unity(1.0); + scalar c0, c1, tmp, c0max, theta, u, w; + scalar xi0, u2, w2, cosw; + scalar fden, h0, h1, h2; + scalar e2iu, emiu, ixi0, qt; + scalar f0, f1, f2; + scalar unity(1.0); - mat iQ2 = arg*arg*alpha*alpha; - mat iQ3 = arg*iQ2*alpha; - // sign in c0 from the conventions on the Ta - scalar imQ3, reQ2; - imQ3 = imag( trace(iQ3) ); - reQ2 = real( trace(iQ2) ); - c0 = -imQ3 * one_over_three; - c1 = -reQ2 * one_over_two; + mat iQ2 = arg*arg*alpha*alpha; + mat iQ3 = arg*iQ2*alpha; + // sign in c0 from the conventions on the Ta + scalar imQ3, reQ2; + imQ3 = imag( trace(iQ3) ); + reQ2 = real( trace(iQ2) ); + c0 = -imQ3 * one_over_three; + c1 = -reQ2 * one_over_two; - // Cayley Hamilton checks to machine precision, tested - tmp 
= c1 * one_over_three; - c0max = 2.0 * pow(tmp, 1.5); + // Cayley Hamilton checks to machine precision, tested + tmp = c1 * one_over_three; + c0max = 2.0 * pow(tmp, 1.5); - theta = acos(c0 / c0max) * one_over_three; - u = sqrt(tmp) * cos(theta); - w = sqrt(c1) * sin(theta); + theta = acos(c0 / c0max) * one_over_three; + u = sqrt(tmp) * cos(theta); + w = sqrt(c1) * sin(theta); - xi0 = sin(w) / w; - u2 = u * u; - w2 = w * w; - cosw = cos(w); + xi0 = sin(w) / w; + u2 = u * u; + w2 = w * w; + cosw = cos(w); - ixi0 = timesI(xi0); - emiu = cos(u) - timesI(sin(u)); - e2iu = cos(2.0 * u) + timesI(sin(2.0 * u)); + ixi0 = timesI(xi0); + emiu = cos(u) - timesI(sin(u)); + e2iu = cos(2.0 * u) + timesI(sin(2.0 * u)); - h0 = e2iu * (u2 - w2) + - emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0)); - h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0); - h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0); + h0 = e2iu * (u2 - w2) + + emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0)); + h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0); + h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0); - fden = unity / (9.0 * u2 - w2); // reals - f0 = h0 * fden; - f1 = h1 * fden; - f2 = h2 * fden; + fden = unity / (9.0 * u2 - w2); // reals + f0 = h0 * fden; + f1 = h1 * fden; + f2 = h2 * fden; - return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2); - } + return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2); +} // General exponential template::TensorLevel == 0 >::type * =nullptr> - inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) - { - // notice that it actually computes - // exp ( input matrix ) - // the i sign is coming from outside - // input matrix is anti-hermitian NOT hermitian - typedef iMatrix mat; - mat unit(1.0); - mat temp(unit); - for(int i=Nexp; i>=1;--i){ - temp *= alpha/RealD(i); - temp = unit + temp*arg; - } - return temp; - - } - - - +inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) +{ + // notice that it actually computes + // exp ( input matrix ) + // the i sign is coming from outside + // input matrix is anti-hermitian NOT hermitian + typedef iMatrix mat; + mat unit(1.0); + mat temp(unit); + for(int i=Nexp; i>=1;--i){ + temp *= alpha/RealD(i); + temp = unit + temp*arg; + } + return temp; } + +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h index a32d3785..ff3e94d6 100644 --- a/lib/tensors/Tensor_extract_merge.h +++ b/lib/tensors/Tensor_extract_merge.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -27,8 +27,8 @@ Author: Christopher Kelly 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
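The generic Exponentiate() that closes the file evaluates the truncated Taylor series of exp(alpha*M) by Horner's scheme: temp = 1 + (alpha/i)*M*temp for i = Nexp..1. A scalar sketch of the recursion (a double stands in for the matrix; alpha and Nexp are illustrative):

#include <cmath>
#include <iostream>

int main() {
  double alpha = 0.5, arg = 1.0;     // exp(alpha*arg) on a 1x1 "matrix"
  int    Nexp  = 12;
  double unit  = 1.0, temp = unit;
  for(int i = Nexp; i >= 1; --i){
    temp *= alpha/double(i);
    temp  = unit + temp*arg;         // temp <- 1 + (alpha/i)*arg*temp
  }
  std::cout << temp << " vs std::exp: " << std::exp(alpha*arg) << "\n";
}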
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_EXTRACT_H #define GRID_EXTRACT_H ///////////////////////////////////////////////////////////////// @@ -37,256 +37,256 @@ Author: Christopher Kelly namespace Grid{ -//////////////////////////////////////////////////////////////////////////////////////////////// -// Extract/merge a fundamental vector type, to pointer array with offset -//////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + // Extract/merge a fundamental vector type, to pointer array with offset + //////////////////////////////////////////////////////////////////////////////////////////////// -template -inline void extract(typename std::enable_if::value, const vsimd >::type * y, + template + inline void extract(typename std::enable_if::value, const vsimd >::type * y, + std::vector &extracted,int offset){ + // FIXME: bounce off memory is painful + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); + int Nextr=extracted.size(); + int s=Nsimd/Nextr; + + scalar*buf = (scalar *)y; + for(int i=0;i + inline void merge(typename std::enable_if::value, vsimd >::type * y, std::vector &extracted,int offset){ - // FIXME: bounce off memory is painful - static const int Nsimd=sizeof(vsimd)/sizeof(scalar); - int Nextr=extracted.size(); - int s=Nsimd/Nextr; - scalar*buf = (scalar *)y; - for(int i=0;i -inline void merge(typename std::enable_if::value, vsimd >::type * y, - std::vector &extracted,int offset){ + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); - static const int Nsimd=sizeof(vsimd)/sizeof(scalar); - - int Nextr=extracted.size(); - int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it - // replicate n-fold. Use to allow Integer masks to - // predicate floating point of various width assignments and maintain conformable. 
- scalar *buf =(scalar *) y; - for(int i=0;i -inline void extract(typename std::enable_if::value, const vsimd >::type &y,std::vector &extracted){ - - int Nextr=extracted.size(); - int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - - scalar *buf = (scalar *)&y; - for(int i=0;i -inline void merge(typename std::enable_if::value, vsimd >::type &y,std::vector &extracted){ - int Nextr=extracted.size(); - static const int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - scalar *buf = (scalar *)&y; - - for(int i=0;i inline void extract(const vobj &vec,std::vector &extracted) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); - static const int words=sizeof(vobj)/sizeof(vector_type); - int Nextr=extracted.size(); - int s=Nsimd/Nextr; - - std::vector pointers(Nextr); - for(int i=0;i(&vp[w],pointers,w); - } -} -//////////////////////////////////////////////////////////////////////// -// Extract to a bunch of scalar object pointers, with offset -//////////////////////////////////////////////////////////////////////// -template inline -void extract(const vobj &vec,std::vector &extracted, int offset) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - static const int words=sizeof(vobj)/sizeof(vector_type); - static const int Nsimd=vobj::vector_type::Nsimd(); - - int Nextr=extracted.size(); - int s = Nsimd/Nextr; - scalar_type * vp = (scalar_type *)&vec; - - for(int w=0;w inline -void extract1(const vobj &vec,std::vector &extracted, int offset) -{ - typedef typename vobj::scalar_type vobj_scalar_type ; - typedef typename vobj::vector_type vobj_vector_type ; - - typedef typename sobj::scalar_type sobj_scalar_type ; - - static const int words=sizeof(vobj)/sizeof(vobj_vector_type); - static const int Nsimd=vobj_vector_type::Nsimd(); - - int Nextr=extracted.size(); - int s = Nsimd/Nextr; - vobj_scalar_type * vp = (vobj_scalar_type *)&vec; - - for(int w=0;w inline -void merge(vobj &vec,std::vector &extracted) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); - static const int words=sizeof(vobj)/sizeof(vector_type); - - int Nextr = extracted.size(); - int splat=Nsimd/Nextr; - - std::vector pointers(Nextr); - for(int i=0;i(&vp[w],pointers,w); - } -} - -//////////////////////////////////////////////////////////////////////// -// Merge a bunch of different scalar object pointers, with offset -//////////////////////////////////////////////////////////////////////// -template inline -void merge(vobj &vec,std::vector &extracted,int offset) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); - const int words=sizeof(vobj)/sizeof(vector_type); - - int Nextr=extracted.size(); - int s=Nsimd/Nextr; - - scalar_type *pointer; - scalar_type *vp = (scalar_type *)&vec; - - // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); - - for(int w=0;w + inline void extract(typename std::enable_if::value, const vsimd >::type &y,std::vector &extracted){ + + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + scalar *buf = (scalar *)&y; + for(int i=0;i + inline void merge(typename std::enable_if::value, vsimd >::type &y,std::vector &extracted){ + int Nextr=extracted.size(); + static const int 
Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + scalar *buf = (scalar *)&y; + + for(int i=0;i inline void extract(const vobj &vec,std::vector &extracted) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); + static const int words=sizeof(vobj)/sizeof(vector_type); + int Nextr=extracted.size(); + int s=Nsimd/Nextr; + + std::vector pointers(Nextr); + for(int i=0;i(&vp[w],pointers,w); + } + } + //////////////////////////////////////////////////////////////////////// + // Extract to a bunch of scalar object pointers, with offset + //////////////////////////////////////////////////////////////////////// + template inline + void extract(const vobj &vec,std::vector &extracted, int offset) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + static const int words=sizeof(vobj)/sizeof(vector_type); + static const int Nsimd=vobj::vector_type::Nsimd(); + + int Nextr=extracted.size(); + int s = Nsimd/Nextr; + scalar_type * vp = (scalar_type *)&vec; + + for(int w=0;w inline void merge1(vobj &vec,std::vector &extracted,int offset) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; + //////////////////////////////////////////////////////////////////////// + // Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change + //////////////////////////////////////////////////////////////////////// + template inline + void extract1(const vobj &vec,std::vector &extracted, int offset) + { + typedef typename vobj::scalar_type vobj_scalar_type ; + typedef typename vobj::vector_type vobj_vector_type ; + + typedef typename sobj::scalar_type sobj_scalar_type ; - static const int Nsimd=vobj::vector_type::Nsimd(); - static const int words=sizeof(vobj)/sizeof(vector_type); + static const int words=sizeof(vobj)/sizeof(vobj_vector_type); + static const int Nsimd=vobj_vector_type::Nsimd(); - scalar_type *vp = (scalar_type *)&vec; + int Nextr=extracted.size(); + int s = Nsimd/Nextr; + vobj_scalar_type * vp = (vobj_scalar_type *)&vec; - // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); - - for(int w=0;w inline void merge2(vobj &vec,std::vector &extracted,int offset) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - const int Nsimd=vobj::vector_type::Nsimd(); - const int words=sizeof(vobj)/sizeof(vector_type); - - scalar_type *pointer; - scalar_type *vp = (scalar_type *)&vec; - // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); - - for(int w=0;w inline + void merge(vobj &vec,std::vector &extracted) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); + static const int words=sizeof(vobj)/sizeof(vector_type); + + int Nextr = extracted.size(); + int splat=Nsimd/Nextr; + + std::vector pointers(Nextr); + for(int i=0;i(&vp[w],pointers,w); + } + } + + //////////////////////////////////////////////////////////////////////// + // Merge a bunch of different scalar object pointers, with offset + //////////////////////////////////////////////////////////////////////// + template inline + void merge(vobj &vec,std::vector &extracted,int offset) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int 
Nsimd=sizeof(vector_type)/sizeof(scalar_type); + const int words=sizeof(vobj)/sizeof(vector_type); + + int Nextr=extracted.size(); + int s=Nsimd/Nextr; + + scalar_type *pointer; + scalar_type *vp = (scalar_type *)&vec; + + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + + for(int w=0;w inline void merge1(vobj &vec,std::vector &extracted,int offset) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + static const int Nsimd=vobj::vector_type::Nsimd(); + static const int words=sizeof(vobj)/sizeof(vector_type); + + scalar_type *vp = (scalar_type *)&vec; + + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + + for(int w=0;w inline void merge2(vobj &vec,std::vector &extracted,int offset) + { + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + scalar_type *pointer; + scalar_type *vp = (scalar_type *)&vec; + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + + for(int w=0;w 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_TENSOR_INDEX_H #define GRID_TENSOR_INDEX_H @@ -35,18 +35,18 @@ Author: Peter Boyle // trace of a different index can distribute across the vector index in a replicated way // but we do not trace a vector index. -namespace Grid { +NAMESPACE_BEGIN(Grid); - /* Needed? -template inline ComplexF traceIndex(const ComplexF arg) { return arg;} -template inline ComplexD traceIndex(const ComplexD arg) { return arg;} -template inline RealF traceIndex(const RealF arg) { return arg;} -template inline RealD traceIndex(const RealD arg) { return arg;} - */ +/* Needed? 
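//
// A minimal round-trip sketch for the extract/merge helpers above
// (illustrative only; assumes a Grid SIMD word such as vComplexF, whose
// lane count is vComplexF::Nsimd()):
//
//   vComplexF v;
//   std::vector<ComplexF> lanes(vComplexF::Nsimd());
//   extract(v, lanes);             // one scalar per SIMD lane
//   lanes[0] = ComplexF(1.0, 0.0); // touch a single lane
//   merge(v, lanes);               // pack the lanes back into the vector word
//
// extracted.size() may also be a proper divisor of Nsimd(): the stride
// s = Nsimd/Nextr in the loops above then addresses every s-th lane.
//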
+ template inline ComplexF traceIndex(const ComplexF arg) { return arg;} + template inline ComplexD traceIndex(const ComplexD arg) { return arg;} + template inline RealF traceIndex(const RealF arg) { return arg;} + template inline RealD traceIndex(const RealD arg) { return arg;} +*/ template class TensorIndexRecursion { - public: +public: //////////////////////////////////////////////////// // Type Queries @@ -76,158 +76,158 @@ class TensorIndexRecursion { ret._internal = TensorIndexRecursion::traceIndex(arg._internal); return ret; } - template - static auto traceIndex(const iVector arg) -> iVector::traceIndex(arg._internal[0])),N> - { - iVector::traceIndex(arg._internal[0])),N> ret; - for(int i=0;i::traceIndex(arg._internal[i]); - } - return ret; +template +static auto traceIndex(const iVector arg) -> iVector::traceIndex(arg._internal[0])),N> +{ + iVector::traceIndex(arg._internal[0])),N> ret; + for(int i=0;i::traceIndex(arg._internal[i]); } + return ret; +} template static auto traceIndex(const iMatrix arg) -> iMatrix::traceIndex(arg._internal[0][0])),N> { iMatrix::traceIndex(arg._internal[0][0])),N> ret; for(int i=0;i::traceIndex(arg._internal[i][j]); - }} - return ret; - } - //////////////////////////////////////////// - // Recursion for peeking a specific index - //////////////////////////////////////////// - template - static auto peekIndex(const iScalar arg,int i) -> iScalar::peekIndex(arg._internal,0))> - { - iScalar::peekIndex(arg._internal,0))> ret; - ret._internal = TensorIndexRecursion::peekIndex(arg._internal,i); - return ret; - } - template - static auto peekIndex(const iScalar arg,int i,int j) -> iScalar::peekIndex(arg._internal,0,0))> - { - iScalar::peekIndex(arg._internal,0,0))> ret; - ret._internal = TensorIndexRecursion::peekIndex(arg._internal,i,j); + for(int j=0;j::traceIndex(arg._internal[i][j]); + }} return ret; } +//////////////////////////////////////////// +// Recursion for peeking a specific index +//////////////////////////////////////////// +template +static auto peekIndex(const iScalar arg,int i) -> iScalar::peekIndex(arg._internal,0))> +{ + iScalar::peekIndex(arg._internal,0))> ret; + ret._internal = TensorIndexRecursion::peekIndex(arg._internal,i); + return ret; +} +template +static auto peekIndex(const iScalar arg,int i,int j) -> iScalar::peekIndex(arg._internal,0,0))> +{ + iScalar::peekIndex(arg._internal,0,0))> ret; + ret._internal = TensorIndexRecursion::peekIndex(arg._internal,i,j); + return ret; +} - template - static auto peekIndex(const iVector arg,int ii) -> iVector::peekIndex(arg._internal[0],0)),N> - { - iVector::peekIndex(arg._internal[0],0)),N> ret; - for(int i=0;i::peekIndex(arg._internal[i],ii); - } - return ret; +template +static auto peekIndex(const iVector arg,int ii) -> iVector::peekIndex(arg._internal[0],0)),N> +{ + iVector::peekIndex(arg._internal[0],0)),N> ret; + for(int i=0;i::peekIndex(arg._internal[i],ii); } - template - static auto peekIndex(const iVector arg,int ii,int jj) -> iVector::peekIndex(arg._internal[0],0,0)),N> - { - iVector::peekIndex(arg._internal[0],0,0)),N> ret; - for(int i=0;i::peekIndex(arg._internal[i],ii,jj); - } - return ret; + return ret; +} +template +static auto peekIndex(const iVector arg,int ii,int jj) -> iVector::peekIndex(arg._internal[0],0,0)),N> +{ + iVector::peekIndex(arg._internal[0],0,0)),N> ret; + for(int i=0;i::peekIndex(arg._internal[i],ii,jj); } + return ret; +} - template - static auto peekIndex(const iMatrix arg,int ii) -> iMatrix::peekIndex(arg._internal[0][0],0)),N> - { - 
iMatrix::peekIndex(arg._internal[0][0],0)),N> ret; - for(int i=0;i +static auto peekIndex(const iMatrix arg,int ii) -> iMatrix::peekIndex(arg._internal[0][0],0)),N> +{ + iMatrix::peekIndex(arg._internal[0][0],0)),N> ret; + for(int i=0;i::peekIndex(arg._internal[i][j],ii); }} - return ret; - } - template - static auto peekIndex(const iMatrix arg,int ii,int jj) -> iMatrix::peekIndex(arg._internal[0][0],0,0)),N> - { - iMatrix::peekIndex(arg._internal[0][0],0,0)),N> ret; - for(int i=0;i +static auto peekIndex(const iMatrix arg,int ii,int jj) -> iMatrix::peekIndex(arg._internal[0][0],0,0)),N> +{ + iMatrix::peekIndex(arg._internal[0][0],0,0)),N> ret; + for(int i=0;i::peekIndex(arg._internal[i][j],ii,jj); }} - return ret; + return ret; +} +//////////////////////////////////////////// +// Recursion for poking a specific index +//////////////////////////////////////////// + +template inline static +void pokeIndex(iScalar &ret, const iScalar::peekIndex(ret._internal,0))> &arg, int i) +{ + TensorIndexRecursion::pokeIndex(ret._internal,arg._internal,i); +} +template inline static +void pokeIndex(iScalar &ret, const iScalar::peekIndex(ret._internal,0,0))> &arg, int i,int j) +{ + TensorIndexRecursion::pokeIndex(ret._internal,arg._internal,i,j); +} + +template inline static +void pokeIndex(iVector &ret, const iVector::peekIndex(ret._internal[0],0)),N> &arg, int i) +{ + for(int ii=0;ii::pokeIndex(ret._internal[ii],arg._internal[ii],i); } - //////////////////////////////////////////// - // Recursion for poking a specific index - //////////////////////////////////////////// - - template inline static - void pokeIndex(iScalar &ret, const iScalar::peekIndex(ret._internal,0))> &arg, int i) - { - TensorIndexRecursion::pokeIndex(ret._internal,arg._internal,i); - } - template inline static - void pokeIndex(iScalar &ret, const iScalar::peekIndex(ret._internal,0,0))> &arg, int i,int j) - { - TensorIndexRecursion::pokeIndex(ret._internal,arg._internal,i,j); - } - - template inline static - void pokeIndex(iVector &ret, const iVector::peekIndex(ret._internal[0],0)),N> &arg, int i) - { - for(int ii=0;ii::pokeIndex(ret._internal[ii],arg._internal[ii],i); - } - } - template inline static - void pokeIndex(iVector &ret, const iVector::peekIndex(ret._internal[0],0,0)),N> &arg, int i,int j) - { - for(int ii=0;ii::pokeIndex(ret._internal[ii],arg._internal[ii],i,j); - } - } - - template inline static - void pokeIndex(iMatrix &ret, const iMatrix::peekIndex(ret._internal[0][0],0)),N> &arg, int i) - { - for(int ii=0;ii::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i); - }} - } - template inline static - void pokeIndex(iMatrix &ret, const iMatrix::peekIndex(ret._internal[0][0],0,0)),N> &arg, int i,int j) - { - for(int ii=0;ii::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i,j); - }} - } - - //////////////////////////////////////////// - // Recursion for transposing a specific index - //////////////////////////////////////////// - template - static auto transposeIndex(const iScalar arg) -> iScalar - { - iScalar ret; - ret._internal = TensorIndexRecursion::transposeIndex(arg._internal); - return ret; +} +template inline static +void pokeIndex(iVector &ret, const iVector::peekIndex(ret._internal[0],0,0)),N> &arg, int i,int j) +{ + for(int ii=0;ii::pokeIndex(ret._internal[ii],arg._internal[ii],i,j); } - template - static auto transposeIndex(const iVector arg) -> iVector - { - iVector ret; - for(int i=0;i::transposeIndex(arg._internal[i]); - } - return ret; +} + +template inline static +void pokeIndex(iMatrix &ret, 
const iMatrix::peekIndex(ret._internal[0][0],0)),N> &arg, int i) +{ + for(int ii=0;ii::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i); + }} +} +template inline static +void pokeIndex(iMatrix &ret, const iMatrix::peekIndex(ret._internal[0][0],0,0)),N> &arg, int i,int j) +{ + for(int ii=0;ii::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i,j); + }} +} + +//////////////////////////////////////////// +// Recursion for transposing a specific index +//////////////////////////////////////////// +template +static auto transposeIndex(const iScalar arg) -> iScalar +{ + iScalar ret; + ret._internal = TensorIndexRecursion::transposeIndex(arg._internal); + return ret; +} +template +static auto transposeIndex(const iVector arg) -> iVector +{ + iVector ret; + for(int i=0;i::transposeIndex(arg._internal[i]); } - template - static auto transposeIndex(const iMatrix arg) -> iMatrix - { - iMatrix ret; - for(int i=0;i +static auto transposeIndex(const iMatrix arg) -> iMatrix +{ + iMatrix ret; + for(int i=0;i::transposeIndex(arg._internal[i][j]); }} - return ret; - } + return ret; +} }; //////////////////////////// @@ -236,7 +236,7 @@ class TensorIndexRecursion { #define RemoveCRV(a) typename std::remove_const::type>::type template<> class TensorIndexRecursion<0> { - public: +public: //////////////////////////////////////////////////// // Type Queries //////////////////////////////////////////////////// @@ -266,16 +266,16 @@ class TensorIndexRecursion<0> { ret._internal = arg._internal; return ret; } - template - static auto traceIndex(const iVector arg) -> iScalar - { - iScalar ret; - ret._internal=zero; - for(int i=0;i +static auto traceIndex(const iVector arg) -> iScalar +{ + iScalar ret; + ret._internal=zero; + for(int i=0;i static auto traceIndex(const iMatrix arg) -> iScalar { @@ -286,56 +286,56 @@ class TensorIndexRecursion<0> { } return ret; } - ///////////////////////////////////////// - // Ends recursion for transpose scalar/matrix ; no way to terminate on vector - ///////////////////////////////////////// - template - static auto transposeIndex(const iScalar arg) -> iScalar - { - iScalar ret; - ret._internal = arg._internal; - return ret; - } - template - static auto transposeIndex(const iMatrix arg) -> iMatrix - { - iMatrix ret; - ret=zero; - for(int i=0;i +static auto transposeIndex(const iScalar arg) -> iScalar +{ + iScalar ret; + ret._internal = arg._internal; + return ret; +} +template +static auto transposeIndex(const iMatrix arg) -> iMatrix +{ + iMatrix ret; + ret=zero; + for(int i=0;i - static auto peekIndex(const iVector arg,int ii) -> iScalar - { - iScalar ret; - ret._internal = arg._internal[ii]; - return ret; - } - template - static auto peekIndex(const iMatrix arg,int ii,int jj) -> iScalar - { - iScalar ret; - ret._internal = arg._internal[ii][jj]; - return ret; - } - // Vector poke, one index - template inline static - void pokeIndex(iVector &ret, const iScalar &arg,int i) - { - ret._internal[i] = arg._internal; - } - // Matrix poke two indices - template inline static - void pokeIndex(iMatrix &ret, const iScalar &arg,int i,int j) - { - ret._internal[i][j] = arg._internal; - } + return ret; +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// End recursion for peeking a specific index; single index on vector, double index on matrix +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +static auto peekIndex(const iVector arg,int ii) 
-> iScalar +{ + iScalar ret; + ret._internal = arg._internal[ii]; + return ret; +} +template +static auto peekIndex(const iMatrix arg,int ii,int jj) -> iScalar +{ + iScalar ret; + ret._internal = arg._internal[ii][jj]; + return ret; +} +// Vector poke, one index +template inline static +void pokeIndex(iVector &ret, const iScalar &arg,int i) +{ + ret._internal[i] = arg._internal; +} +// Matrix poke two indices +template inline static +void pokeIndex(iMatrix &ret, const iScalar &arg,int i,int j) +{ + ret._internal[i][j] = arg._internal; +} }; @@ -404,5 +404,6 @@ void pokeIndex (vtype &ret,const decltype(TensorIndexRecursion::peekIndex #undef RemoveCRV -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 46185652..e457f936 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,24 +24,26 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H -namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - auto nrm = innerProductD(arg,arg); - RealD ret = real(nrm); - return ret; - } - ////////////////////////////////////// - // If single promote to double and sum 2x - ////////////////////////////////////// + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////////////////////////////////////////////////////// +// innerProduct Scalar x Scalar -> Scalar +// innerProduct Vector x Vector -> Scalar +// innerProduct Matrix x Matrix -> Scalar +/////////////////////////////////////////////////////////////////////////////////////// +template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; +} +////////////////////////////////////// +// If single promote to double and sum 2x +////////////////////////////////////// inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); } inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); } @@ -65,73 +67,74 @@ inline vRealD innerProductD(const vRealF &l,const vRealF &r){ return innerProduct(la,ra) + innerProduct(lb,rb); } - template inline - auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline +auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar +{ + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef 
decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline +auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar +{ + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline +auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar +{ + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; +} +////////////////////// +// Keep same precison +////////////////////// +template inline +auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar +{ + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProductD(lhs._internal,rhs._internal); - return ret; - } - ////////////////////// - // Keep same precison - ////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline +auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar +{ + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; - } - + return ret; } +template inline +auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar +{ + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; +} + +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_logical.h b/lib/tensors/Tensor_logical.h index 7ab3668b..85b586ba 100644 --- a/lib/tensors/Tensor_logical.h +++ b/lib/tensors/Tensor_logical.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,37 +23,38 @@ Author: Azusa Yamaguchi 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
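//
// A minimal sketch tying the index helpers (Tensor_index.h) and the
// inner products (Tensor_inner.h) above together (illustrative only;
// SpinColourVector and SpinIndex are the QCD-layer conventions):
//
//   SpinColourVector sc;
//   auto cv = peekIndex<SpinIndex>(sc, 2); // colour vector at spin index 2
//   pokeIndex<SpinIndex>(sc, cv, 3);       // deposit it at spin index 3
//   RealD n = norm2(sc); // real(innerProductD(sc, sc)), accumulated in double
//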
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_TENSOR_LOGICAL_H #define GRID_TENSOR_LOGICAL_H -namespace Grid { +NAMESPACE_BEGIN(Grid); -#define LOGICAL_BINOP(Op)\ -template strong_inline iScalar operator Op (const iScalar& lhs,const iScalar& rhs) \ -{\ - iScalar ret;\ - ret._internal = lhs._internal Op rhs._internal ;\ - return ret;\ -}\ -template strong_inline iScalar operator Op (const iScalar& lhs,Integer rhs) \ -{\ - typename iScalar::scalar_type t; t=rhs;\ - typename iScalar::tensor_reduced srhs; srhs=t;\ - return lhs Op srhs;\ -}\ -template strong_inline iScalar operator Op (Integer lhs,const iScalar& rhs) \ -{\ - typename iScalar::scalar_type t;t=lhs;\ - typename iScalar::tensor_reduced slhs;slhs=t;\ - return slhs Op rhs;\ -} +#define LOGICAL_BINOP(Op) \ + template strong_inline iScalar operator Op (const iScalar& lhs,const iScalar& rhs) \ + { \ + iScalar ret; \ + ret._internal = lhs._internal Op rhs._internal ; \ + return ret; \ + } \ + template strong_inline iScalar operator Op (const iScalar& lhs,Integer rhs) \ + { \ + typename iScalar::scalar_type t; t=rhs; \ + typename iScalar::tensor_reduced srhs; srhs=t; \ + return lhs Op srhs; \ + } \ + template strong_inline iScalar operator Op (Integer lhs,const iScalar& rhs) \ + { \ + typename iScalar::scalar_type t;t=lhs; \ + typename iScalar::tensor_reduced slhs;slhs=t; \ + return slhs Op rhs; \ + } LOGICAL_BINOP(|); LOGICAL_BINOP(&); LOGICAL_BINOP(||); LOGICAL_BINOP(&&); -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_outer.h b/lib/tensors/Tensor_outer.h index 6429a190..33a34de2 100644 --- a/lib/tensors/Tensor_outer.h +++ b/lib/tensors/Tensor_outer.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,36 +23,38 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
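//
// What one instantiation of the LOGICAL_BINOP macro above provides, in
// effect (a sketch, assuming the wrapped vtype itself defines the
// operator, as vInteger does for the bitwise forms):
//
//   iScalar<vInteger> a, b;
//   auto c = a & b;          // ret._internal = a._internal & b._internal
//   auto d = a & Integer(1); // scalar rhs promoted to tensor_reduced first
//
// These overloads are what let predicate expressions over integer
// tensors combine before feeding where()-style selection.
//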
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_OUTER_H #define GRID_MATH_OUTER_H -namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // outerProduct Scalar x Scalar -> Scalar - // Vector x Vector -> Matrix - /////////////////////////////////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////////////////////////////////////////////////////// +// outerProduct Scalar x Scalar -> Scalar +// Vector x Vector -> Matrix +/////////////////////////////////////////////////////////////////////////////////////// template inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix { - typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 inline auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar { - typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = outerProduct(lhs._internal,rhs._internal); - return ret; + typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = outerProduct(lhs._internal,rhs._internal); + return ret; } @@ -75,5 +77,6 @@ inline RealD outerProduct(const RealD &l, const RealD& r) return l*r; } -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_reality.h b/lib/tensors/Tensor_reality.h index 8a183b88..df0dee4a 100644 --- a/lib/tensors/Tensor_reality.h +++ b/lib/tensors/Tensor_reality.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,20 +24,21 @@ Author: neo 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_REALITY_H #define GRID_MATH_REALITY_H -namespace Grid { + +NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////// // multiply by I; make recursive. 
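//
// A minimal sketch of the outer product defined above (illustrative
// only; ColourVector/ColourMatrix are the QCD-layer typedefs):
//
//   ColourVector v, w;
//   ColourMatrix m = outerProduct(v, w); // m(i,j) = outerProduct(v(i), w(j))
//
// At the complex base level outerProduct(l, r) conjugates the right
// operand, so this is the v w^dag construction used for colour projectors.
//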
/////////////////////////////////////////////// template inline iScalar timesI(const iScalar&r) { - iScalar ret; - timesI(ret._internal,r._internal); - return ret; + iScalar ret; + timesI(ret._internal,r._internal); + return ret; } template inline iVector timesI(const iVector&r) { @@ -51,9 +52,9 @@ template inline iMatrix timesI(const iMatrix ret; for(int i=0;i inline void timesI(iVector &ret,const iVect template inline void timesI(iMatrix &ret,const iMatrix&r) { for(int i=0;i inline iScalar timesMinusI(const iScalar&r) { - iScalar ret; - timesMinusI(ret._internal,r._internal); - return ret; + iScalar ret; + timesMinusI(ret._internal,r._internal); + return ret; } template inline iVector timesMinusI(const iVector&r) { @@ -94,9 +95,9 @@ template inline iMatrix timesMinusI(const iMatrix ret; for(int i=0;i inline void timesMinusI(iVector &ret,const template inline void timesMinusI(iMatrix &ret,const iMatrix&r) { for(int i=0;i inline void timesMinusI(iMatrix &ret,const /////////////////////////////////////////////// template inline iScalar conjugate(const iScalar&r) { - iScalar ret; - ret._internal = conjugate(r._internal); - return ret; + iScalar ret; + ret._internal = conjugate(r._internal); + return ret; } template inline iVector conjugate(const iVector&r) { @@ -140,9 +141,9 @@ template inline iMatrix conjugate(const iMatrix ret; for(int i=0;i inline iMatrix conjugate(const iMatrix inline iScalar adj(const iScalar&r) { - iScalar ret; - ret._internal = adj(r._internal); - return ret; + iScalar ret; + ret._internal = adj(r._internal); + return ret; } template inline iVector adj(const iVector&r) { - iVector ret; - for(int i=0;i ret; + for(int i=0;i inline iMatrix adj(const iMatrix &arg) { - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 inline iMatrix adj(const iMatrix & ///////////////////////////////////////////////////////////////// template inline auto real(const iScalar &z) -> iScalar { - iScalar ret; - ret._internal = real(z._internal); - return ret; + iScalar ret; + ret._internal = real(z._internal); + return ret; } template inline auto real(const iMatrix &z) -> iMatrix { - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 inline auto real(const iVector &z) -> iVector { - iVector ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 inline auto imag(const iScalar &z) -> iScalar { - iScalar ret; - ret._internal = imag(z._internal); - return ret; + iScalar ret; + ret._internal = imag(z._internal); + return ret; } template inline auto imag(const iMatrix &z) -> iMatrix { - iMatrix ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 inline auto imag(const iVector &z) -> iVector { - iVector ret; - for(int c1=0;c1 ret; + for(int c1=0;c1 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_TRACE_H #define GRID_MATH_TRACE_H -namespace Grid { + +NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////// // Traces: both all indices and a specific index. 
Indices must be @@ -43,24 +44,24 @@ inline RealD trace( const RealD &arg){ return arg;} template inline auto trace(const iMatrix &arg) -> iScalar { - iScalar ret; - zeroit(ret._internal); - for(int i=0;i ret; + zeroit(ret._internal); + for(int i=0;i inline auto trace(const iScalar &arg) -> iScalar { - iScalar ret; - ret._internal=trace(arg._internal); - return ret; + iScalar ret; + ret._internal=trace(arg._internal); + return ret; } template - inline auto trace(const iVector &arg) -> iVector +inline auto trace(const iVector &arg) -> iVector { iVector ret; for(int i=0;i return ret; } +NAMESPACE_END(Grid); -} #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index c1ef397a..dd5986c7 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_traits.h Copyright (C) 2015 @@ -17,14 +17,14 @@ Author: Christopher Kelly with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_TRAITS_H #define GRID_MATH_TRAITS_H #include -namespace Grid { +NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////// // Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
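//
// The recursion described above, spelled out (a sketch; vSpinColourMatrixD
// is the QCD-layer typedef iSpinColourMatrix<vComplexD>):
//
//   static_assert(std::is_same<GridTypeMapper<vSpinColourMatrixD>::scalar_type,
//                              ComplexD>::value, "recurses to the base scalar");
//   static_assert(std::is_same<GridTypeMapper<vSpinColourMatrixD>::vector_type,
//                              vComplexD>::value, "and to the base SIMD word");
//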
@@ -41,254 +41,256 @@ namespace Grid { // ////////////////////////////////////////////////////////////////////////////////// - template class GridTypeMapper { - public: - typedef typename T::scalar_type scalar_type; - typedef typename T::vector_type vector_type; - typedef typename T::vector_typeD vector_typeD; - typedef typename T::tensor_reduced tensor_reduced; - typedef typename T::scalar_object scalar_object; - typedef typename T::Complexified Complexified; - typedef typename T::Realified Realified; - typedef typename T::DoublePrecision DoublePrecision; - enum { TensorLevel = T::TensorLevel }; - }; +template class GridTypeMapper { +public: + typedef typename T::scalar_type scalar_type; + typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; + typedef typename T::tensor_reduced tensor_reduced; + typedef typename T::scalar_object scalar_object; + typedef typename T::Complexified Complexified; + typedef typename T::Realified Realified; + typedef typename T::DoublePrecision DoublePrecision; + enum { TensorLevel = T::TensorLevel }; +}; ////////////////////////////////////////////////////////////////////////////////// // Recursion stops with these template specialisations ////////////////////////////////////////////////////////////////////////////////// - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef RealF vector_type; - typedef RealD vector_typeD; - typedef RealF tensor_reduced ; - typedef RealF scalar_object; - typedef ComplexF Complexified; - typedef RealF Realified; - typedef RealD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef RealD vector_type; - typedef RealD vector_typeD; - typedef RealD tensor_reduced; - typedef RealD scalar_object; - typedef ComplexD Complexified; - typedef RealD Realified; - typedef RealD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef ComplexF vector_type; - typedef ComplexD vector_typeD; - typedef ComplexF tensor_reduced; - typedef ComplexF scalar_object; - typedef ComplexF Complexified; - typedef RealF Realified; - typedef ComplexD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef ComplexD vector_type; - typedef ComplexD vector_typeD; - typedef ComplexD tensor_reduced; - typedef ComplexD scalar_object; - typedef ComplexD Complexified; - typedef RealD Realified; - typedef ComplexD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef Integer scalar_type; - typedef Integer vector_type; - typedef Integer vector_typeD; - typedef Integer tensor_reduced; - typedef Integer scalar_object; - typedef void Complexified; - typedef void Realified; - typedef void DoublePrecision; - enum { TensorLevel = 0 }; - }; +template<> class GridTypeMapper { +public: + typedef RealF scalar_type; + typedef RealF vector_type; + typedef RealD vector_typeD; + typedef RealF tensor_reduced ; + typedef RealF scalar_object; + typedef ComplexF Complexified; + typedef RealF Realified; + typedef RealD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef RealD scalar_type; + typedef RealD vector_type; + typedef RealD vector_typeD; + typedef RealD tensor_reduced; + typedef RealD scalar_object; + typedef ComplexD Complexified; + typedef RealD Realified; + typedef 
RealD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef ComplexF scalar_type; + typedef ComplexF vector_type; + typedef ComplexD vector_typeD; + typedef ComplexF tensor_reduced; + typedef ComplexF scalar_object; + typedef ComplexF Complexified; + typedef RealF Realified; + typedef ComplexD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef ComplexD scalar_type; + typedef ComplexD vector_type; + typedef ComplexD vector_typeD; + typedef ComplexD tensor_reduced; + typedef ComplexD scalar_object; + typedef ComplexD Complexified; + typedef RealD Realified; + typedef ComplexD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef Integer scalar_type; + typedef Integer vector_type; + typedef Integer vector_typeD; + typedef Integer tensor_reduced; + typedef Integer scalar_object; + typedef void Complexified; + typedef void Realified; + typedef void DoublePrecision; + enum { TensorLevel = 0 }; +}; - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef vRealF vector_type; - typedef vRealD vector_typeD; - typedef vRealF tensor_reduced; - typedef RealF scalar_object; - typedef vComplexF Complexified; - typedef vRealF Realified; - typedef vRealD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef vRealD vector_type; - typedef vRealD vector_typeD; - typedef vRealD tensor_reduced; - typedef RealD scalar_object; - typedef vComplexD Complexified; - typedef vRealD Realified; - typedef vRealD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexH vector_type; - typedef vComplexD vector_typeD; - typedef vComplexH tensor_reduced; - typedef ComplexF scalar_object; - typedef vComplexH Complexified; - typedef vRealH Realified; - typedef vComplexD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexF vector_type; - typedef vComplexD vector_typeD; - typedef vComplexF tensor_reduced; - typedef ComplexF scalar_object; - typedef vComplexF Complexified; - typedef vRealF Realified; - typedef vComplexD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef vComplexD vector_type; - typedef vComplexD vector_typeD; - typedef vComplexD tensor_reduced; - typedef ComplexD scalar_object; - typedef vComplexD Complexified; - typedef vRealD Realified; - typedef vComplexD DoublePrecision; - enum { TensorLevel = 0 }; - }; - template<> class GridTypeMapper { - public: - typedef Integer scalar_type; - typedef vInteger vector_type; - typedef vInteger vector_typeD; - typedef vInteger tensor_reduced; - typedef Integer scalar_object; - typedef void Complexified; - typedef void Realified; - typedef void DoublePrecision; - enum { TensorLevel = 0 }; - }; +template<> class GridTypeMapper { +public: + typedef RealF scalar_type; + typedef vRealF vector_type; + typedef vRealD vector_typeD; + typedef vRealF tensor_reduced; + typedef RealF scalar_object; + typedef vComplexF Complexified; + typedef vRealF Realified; + typedef vRealD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef RealD scalar_type; + typedef vRealD vector_type; + typedef vRealD vector_typeD; + typedef 
vRealD tensor_reduced; + typedef RealD scalar_object; + typedef vComplexD Complexified; + typedef vRealD Realified; + typedef vRealD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef ComplexF scalar_type; + typedef vComplexH vector_type; + typedef vComplexD vector_typeD; + typedef vComplexH tensor_reduced; + typedef ComplexF scalar_object; + typedef vComplexH Complexified; + typedef vRealH Realified; + typedef vComplexD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef ComplexF scalar_type; + typedef vComplexF vector_type; + typedef vComplexD vector_typeD; + typedef vComplexF tensor_reduced; + typedef ComplexF scalar_object; + typedef vComplexF Complexified; + typedef vRealF Realified; + typedef vComplexD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef ComplexD scalar_type; + typedef vComplexD vector_type; + typedef vComplexD vector_typeD; + typedef vComplexD tensor_reduced; + typedef ComplexD scalar_object; + typedef vComplexD Complexified; + typedef vRealD Realified; + typedef vComplexD DoublePrecision; + enum { TensorLevel = 0 }; +}; +template<> class GridTypeMapper { +public: + typedef Integer scalar_type; + typedef vInteger vector_type; + typedef vInteger vector_typeD; + typedef vInteger tensor_reduced; + typedef Integer scalar_object; + typedef void Complexified; + typedef void Realified; + typedef void DoublePrecision; + enum { TensorLevel = 0 }; +}; - // First some of my own traits - template struct isGridTensor { - static const bool value = true; - static const bool notvalue = false; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; +// First some of my own traits +template struct isGridTensor { + static const bool value = true; + static const bool notvalue = false; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool 
value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; +template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; +}; - // Match the index - template struct matchGridTensorIndex { - static const bool value = (Level==T::TensorLevel); - static const bool notvalue = (Level!=T::TensorLevel); - }; - // What is the vtype - template struct isComplex { - static const bool value = false; - }; - template<> struct isComplex { - static const bool value = true; - }; - template<> struct isComplex { - static const bool value = true; - }; +// Match the index +template struct matchGridTensorIndex { + static const bool value = (Level==T::TensorLevel); + static const bool notvalue = (Level!=T::TensorLevel); +}; +// What is the vtype +template struct isComplex { + static const bool value = false; +}; +template<> struct isComplex { + static const bool value = true; +}; +template<> struct isComplex { + static const bool value = true; +}; - //Get the SIMD vector type from a Grid tensor or Lattice - template - struct getVectorType{ - typedef T type; - }; +//Get the SIMD vector type from a Grid tensor or Lattice +template +struct getVectorType{ + typedef T type; +}; - //Query if a tensor or Lattice is SIMD vector or scalar - template - class isSIMDvectorized{ - template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, - typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); +//Query if a tensor or Lattice is SIMD vector or scalar +template +class isSIMDvectorized{ + template + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); - template - static double test(...); + template + static double test(...); - public: - enum {value = sizeof(test(0)) == sizeof(char) }; - }; +public: + enum {value = sizeof(test(0)) == sizeof(char) }; +}; - //Get the precision of a Lattice, tensor or scalar type in units of sizeof(float) - template - class getPrecision{ - public: - //get the vector_obj (i.e. a grid Tensor) if its a Lattice, do nothing otherwise (i.e. if fundamental or grid Tensor) - typedef typename getVectorType::type vector_obj; - typedef typename GridTypeMapper::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types - typedef typename GridTypeMapper::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type +//Get the precision of a Lattice, tensor or scalar type in units of sizeof(float) +template +class getPrecision{ +public: + //get the vector_obj (i.e. a grid Tensor) if its a Lattice, do nothing otherwise (i.e. if fundamental or grid Tensor) + typedef typename getVectorType::type vector_obj; + typedef typename GridTypeMapper::scalar_type scalar_type; //get the associated scalar type. 
Works on fundamental and tensor types + typedef typename GridTypeMapper::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type + + enum { value = sizeof(real_scalar_type)/sizeof(float) }; +}; + +NAMESPACE_END(Grid); - enum { value = sizeof(real_scalar_type)/sizeof(float) }; - }; -} #endif diff --git a/lib/tensors/Tensor_transpose.h b/lib/tensors/Tensor_transpose.h index c12d4f09..b2330461 100644 --- a/lib/tensors/Tensor_transpose.h +++ b/lib/tensors/Tensor_transpose.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,13 +23,12 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_TRANSPOSE_H #define GRID_MATH_TRANSPOSE_H -namespace Grid { - +NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////// // Transpose all indices @@ -41,45 +40,45 @@ inline RealD transpose(RealD &rhs){ return rhs;} inline RealF transpose(RealF &rhs){ return rhs;} template - inline typename std::enable_if::value, iMatrix >::type - transpose(iMatrix arg) - { - iMatrix ret; - for(int i=0;i::value, iMatrix >::type +transpose(iMatrix arg) +{ + iMatrix ret; + for(int i=0;i - inline typename std::enable_if::notvalue, iMatrix >::type - transpose(iMatrix arg) - { - iMatrix ret; - for(int i=0;i::notvalue, iMatrix >::type +transpose(iMatrix arg) +{ + iMatrix ret; + for(int i=0;i - inline typename std::enable_if::value, iScalar >::type - transpose(iScalar arg) - { - iScalar ret; - ret._internal = transpose(arg._internal); // NB recurses - return ret; - } +inline typename std::enable_if::value, iScalar >::type +transpose(iScalar arg) +{ + iScalar ret; + ret._internal = transpose(arg._internal); // NB recurses + return ret; +} template - inline typename std::enable_if::notvalue, iScalar >::type - transpose(iScalar arg) - { - iScalar ret; - ret._internal = arg._internal; // NB recursion stops - return ret; - } +inline typename std::enable_if::notvalue, iScalar >::type +transpose(iScalar arg) +{ + iScalar ret; + ret._internal = arg._internal; // NB recursion stops + return ret; +} //////////////////////////////////////////////////////////////////////////////////////////// @@ -88,14 +87,14 @@ template //////////////////////////////////////////////////////////////////////////////////////////// #if 0 template inline - typename std::enable_if,Level>::value, iMatrix >::type +typename std::enable_if,Level>::value, iMatrix >::type transposeIndex (const iMatrix &arg) { iMatrix ret; for(int i=0;i &arg) for(int i=0;i(arg._internal[i][j]); - }} + }} return ret; } template inline @@ -126,5 +125,6 @@ transposeIndex (const iScalar &arg) } #endif -} +NAMESPACE_END(Grid); + #endif diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h index dd05a4a7..5640e173 100644 --- a/lib/tensors/Tensor_unary.h +++ b/lib/tensors/Tensor_unary.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* 
Grid physics library, www.github.com/paboyle/Grid @@ -25,62 +25,63 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_TENSOR_UNARY_H #define GRID_TENSOR_UNARY_H -namespace Grid { -#define UNARY(func)\ -template inline auto func(const iScalar &z) -> iScalar\ -{\ - iScalar ret;\ - ret._internal = func( (z._internal));\ - return ret;\ -}\ -template inline auto func(const iVector &z) -> iVector\ -{\ - iVector ret;\ - for(int c1=0;c1 inline auto func(const iMatrix &z) -> iMatrix\ -{\ - iMatrix ret;\ - for(int c1=0;c1 inline auto func(const iScalar &z) -> iScalar \ + { \ + iScalar ret; \ + ret._internal = func( (z._internal)); \ + return ret; \ + } \ + template inline auto func(const iVector &z) -> iVector \ + { \ + iVector ret; \ + for(int c1=0;c1 inline auto func(const iMatrix &z) -> iMatrix \ + { \ + iMatrix ret; \ + for(int c1=0;c1 inline iScalar func(const iScalar &z,scal y) \ -{\ - iScalar ret;\ - ret._internal = func(z._internal,y); \ - return ret;\ -}\ - template inline iVector func(const iVector &z,scal y) \ -{\ - iVector ret;\ - for(int c1=0;c1 inline iMatrix func(const iMatrix &z, scal y) \ -{\ - iMatrix ret;\ - for(int c1=0;c1 inline iScalar func(const iScalar &z,scal y) \ + { \ + iScalar ret; \ + ret._internal = func(z._internal,y); \ + return ret; \ + } \ + template inline iVector func(const iVector &z,scal y) \ + { \ + iVector ret; \ + for(int c1=0;c1 inline iMatrix func(const iMatrix &z, scal y) \ + { \ + iMatrix ret; \ + for(int c1=0;c1 inline auto toReal(const iScalar &z) -> typename iScala ret._internal = toReal(z._internal); return ret; } - template inline auto toReal(const iVector &z) -> typename iVector::Realified +template inline auto toReal(const iVector &z) -> typename iVector::Realified { typename iVector::Realified ret; for(int c1=0;c1 inline auto toReal(const iMatrix &z) -> typenam { typename iMatrix::Realified ret; for(int c1=0;c1 inline auto toComplex(const iScalar &z) -> typename iSc ret._internal = toComplex(z._internal); return ret; } - template inline auto toComplex(const iVector &z) -> typename iVector::Complexified +template inline auto toComplex(const iVector &z) -> typename iVector::Complexified { typename iVector::Complexified ret; for(int c1=0;c1 inline auto toComplex(const iMatrix &z) -> type { typename iMatrix::Complexified ret; for(int c1=0;c1 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
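//
// In effect each UNARY(func) above generates three overloads that map
// func over iScalar, iVector and iMatrix, recursing until it reaches the
// SIMD scalar (a sketch, assuming Tensor_unary.h instantiates UNARY(sqrt)
// among the functions elided from this hunk):
//
//   iVector<vRealD, 3> v;
//   auto r = sqrt(v); // r(i) = sqrt(v(i)), applied lane by lane
//
// The second macro plays the same game for functions carrying a trailing
// scalar argument, func(z, y).
//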
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_MATH_H #define GRID_MATH_H From 1fbab4032b9245d8687b1a2fab18eeaf507b9ca0 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:51:19 +0000 Subject: [PATCH 048/754] Namespace changes --- lib/qcd/QCD.h | 818 ++++++++++++++++++++++++-------------------------- 1 file changed, 391 insertions(+), 427 deletions(-) diff --git a/lib/qcd/QCD.h b/lib/qcd/QCD.h index 531b71bd..945d173b 100644 --- a/lib/qcd/QCD.h +++ b/lib/qcd/QCD.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -27,514 +27,478 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_QCD_BASE_H #define GRID_QCD_BASE_H -namespace Grid{ -namespace QCD { +NAMESPACE_BEGIN(Grid); - static const int Xdir = 0; - static const int Ydir = 1; - static const int Zdir = 2; - static const int Tdir = 3; +static const int Xdir = 0; +static const int Ydir = 1; +static const int Zdir = 2; +static const int Tdir = 3; - static const int Xp = 0; - static const int Yp = 1; - static const int Zp = 2; - static const int Tp = 3; - static const int Xm = 4; - static const int Ym = 5; - static const int Zm = 6; - static const int Tm = 7; +static const int Xp = 0; +static const int Yp = 1; +static const int Zp = 2; +static const int Tp = 3; +static const int Xm = 4; +static const int Ym = 5; +static const int Zm = 6; +static const int Tm = 7; - static const int Nc=3; - static const int Ns=4; - static const int Nd=4; - static const int Nhs=2; // half spinor - static const int Nds=8; // double stored gauge field - static const int Ngp=2; // gparity index range +static const int Nc=3; +static const int Ns=4; +static const int Nd=4; +static const int Nhs=2; // half spinor +static const int Nds=8; // double stored gauge field +static const int Ngp=2; // gparity index range - ////////////////////////////////////////////////////////////////////////////// - // QCD iMatrix types - // Index conventions: Lorentz x Spin x Colour - // note: static const int or constexpr will work for type deductions - // with the intel compiler (up to version 17) - ////////////////////////////////////////////////////////////////////////////// - #define ColourIndex 2 - #define SpinIndex 1 - #define LorentzIndex 0 +////////////////////////////////////////////////////////////////////////////// +// QCD iMatrix types +// Index conventions: Lorentz x Spin x Colour +// note: static const int or constexpr will work for type deductions +// with the intel compiler (up to version 17) +////////////////////////////////////////////////////////////////////////////// +#define ColourIndex 2 +#define SpinIndex 1 +#define LorentzIndex 0 - // Also should make these a named enum type - static const int DaggerNo=0; - static const int DaggerYes=1; - static const int InverseNo=0; - 
static const int InverseYes=1; +// Also should make these a named enum type +static const int DaggerNo=0; +static const int DaggerYes=1; +static const int InverseNo=0; +static const int InverseYes=1; - // Useful traits is this a spin index - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; +// Useful traits is this a spin index +//typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - const int SpinorIndex = 2; - template struct isSpinor { - static const bool value = (SpinorIndex==T::TensorLevel); - }; - template using IfSpinor = Invoke::value,int> > ; - template using IfNotSpinor = Invoke::value,int> > ; +const int SpinorIndex = 2; +template struct isSpinor { + static const bool value = (SpinorIndex==T::TensorLevel); +}; +template using IfSpinor = Invoke::value,int> > ; +template using IfNotSpinor = Invoke::value,int> > ; - // ChrisK very keen to add extra space for Gparity doubling. - // - // Also add domain wall index, in a way where Wilson operator - // naturally distributes across the 5th dimensions. - // - // That probably makes for GridRedBlack4dCartesian grid. +// ChrisK very keen to add extra space for Gparity doubling. +// +// Also add domain wall index, in a way where Wilson operator +// naturally distributes across the 5th dimensions. +// +// That probably makes for GridRedBlack4dCartesian grid. - // s,sp,c,spc,lc - template using iSinglet = iScalar > >; - template using iSpinMatrix = iScalar, Ns> >; - template using iColourMatrix = iScalar > > ; - template using iSpinColourMatrix = iScalar, Ns> >; - template using iLorentzColourMatrix = iVector >, Nd > ; - template using iDoubleStoredColourMatrix = iVector >, Nds > ; - template using iSpinVector = iScalar, Ns> >; - template using iColourVector = iScalar > >; - template using iSpinColourVector = iScalar, Ns> >; - template using iHalfSpinVector = iScalar, Nhs> >; - template using iHalfSpinColourVector = iScalar, Nhs> >; +// s,sp,c,spc,lc +template using iSinglet = iScalar > >; +template using iSpinMatrix = iScalar, Ns> >; +template using iColourMatrix = iScalar > > ; +template using iSpinColourMatrix = iScalar, Ns> >; +template using iLorentzColourMatrix = iVector >, Nd > ; +template using iDoubleStoredColourMatrix = iVector >, Nds > ; +template using iSpinVector = iScalar, Ns> >; +template using iColourVector = iScalar > >; +template using iSpinColourVector = iScalar, Ns> >; +template using iHalfSpinVector = iScalar, Nhs> >; +template using iHalfSpinColourVector = iScalar, Nhs> >; - template using iGparitySpinColourVector = iVector, Ns>, Ngp >; - template using iGparityHalfSpinColourVector = iVector, Nhs>, Ngp >; +template using iGparitySpinColourVector = iVector, Ns>, Ngp >; +template using iGparityHalfSpinColourVector = iVector, Nhs>, Ngp >; - // Spin matrix - typedef iSpinMatrix SpinMatrix; - typedef iSpinMatrix SpinMatrixF; - typedef iSpinMatrix SpinMatrixD; +// Spin matrix +typedef iSpinMatrix SpinMatrix; +typedef iSpinMatrix SpinMatrixF; +typedef iSpinMatrix SpinMatrixD; - typedef iSpinMatrix vSpinMatrix; - typedef iSpinMatrix vSpinMatrixF; - typedef iSpinMatrix vSpinMatrixD; +typedef iSpinMatrix vSpinMatrix; +typedef iSpinMatrix vSpinMatrixF; +typedef iSpinMatrix vSpinMatrixD; - // Colour Matrix - typedef iColourMatrix ColourMatrix; - typedef iColourMatrix ColourMatrixF; - typedef iColourMatrix ColourMatrixD; +// Colour Matrix +typedef iColourMatrix ColourMatrix; +typedef iColourMatrix ColourMatrixF; +typedef iColourMatrix ColourMatrixD; - typedef iColourMatrix vColourMatrix; - 
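//
// How the Lorentz x Spin x Colour ordering above nests, concretely
// (illustrative only):
//
//   iSpinColourVector<vtype> = iScalar< iVector< iVector<vtype, Nc>, Ns > >
//
// so LorentzIndex (0) is the outermost level, SpinIndex (1) the Ns-fold
// vector and ColourIndex (2) the innermost Nc-fold vector, e.g.
//
//   SpinColourVector sc;
//   auto c0 = peekIndex<ColourIndex>(sc, 0); // spin vector at colour component 0
//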
typedef iColourMatrix vColourMatrixF; - typedef iColourMatrix vColourMatrixD; +typedef iColourMatrix vColourMatrix; +typedef iColourMatrix vColourMatrixF; +typedef iColourMatrix vColourMatrixD; - // SpinColour matrix - typedef iSpinColourMatrix SpinColourMatrix; - typedef iSpinColourMatrix SpinColourMatrixF; - typedef iSpinColourMatrix SpinColourMatrixD; +// SpinColour matrix +typedef iSpinColourMatrix SpinColourMatrix; +typedef iSpinColourMatrix SpinColourMatrixF; +typedef iSpinColourMatrix SpinColourMatrixD; - typedef iSpinColourMatrix vSpinColourMatrix; - typedef iSpinColourMatrix vSpinColourMatrixF; - typedef iSpinColourMatrix vSpinColourMatrixD; +typedef iSpinColourMatrix vSpinColourMatrix; +typedef iSpinColourMatrix vSpinColourMatrixF; +typedef iSpinColourMatrix vSpinColourMatrixD; - // LorentzColour - typedef iLorentzColourMatrix LorentzColourMatrix; - typedef iLorentzColourMatrix LorentzColourMatrixF; - typedef iLorentzColourMatrix LorentzColourMatrixD; +// LorentzColour +typedef iLorentzColourMatrix LorentzColourMatrix; +typedef iLorentzColourMatrix LorentzColourMatrixF; +typedef iLorentzColourMatrix LorentzColourMatrixD; - typedef iLorentzColourMatrix vLorentzColourMatrix; - typedef iLorentzColourMatrix vLorentzColourMatrixF; - typedef iLorentzColourMatrix vLorentzColourMatrixD; +typedef iLorentzColourMatrix vLorentzColourMatrix; +typedef iLorentzColourMatrix vLorentzColourMatrixF; +typedef iLorentzColourMatrix vLorentzColourMatrixD; - // DoubleStored gauge field - typedef iDoubleStoredColourMatrix DoubleStoredColourMatrix; - typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixF; - typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixD; +// DoubleStored gauge field +typedef iDoubleStoredColourMatrix DoubleStoredColourMatrix; +typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixF; +typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixD; - typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrix; - typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixF; - typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixD; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrix; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixF; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixD; - // Spin vector - typedef iSpinVector SpinVector; - typedef iSpinVector SpinVectorF; - typedef iSpinVector SpinVectorD; +// Spin vector +typedef iSpinVector SpinVector; +typedef iSpinVector SpinVectorF; +typedef iSpinVector SpinVectorD; - typedef iSpinVector vSpinVector; - typedef iSpinVector vSpinVectorF; - typedef iSpinVector vSpinVectorD; +typedef iSpinVector vSpinVector; +typedef iSpinVector vSpinVectorF; +typedef iSpinVector vSpinVectorD; - // Colour vector - typedef iColourVector ColourVector; - typedef iColourVector ColourVectorF; - typedef iColourVector ColourVectorD; +// Colour vector +typedef iColourVector ColourVector; +typedef iColourVector ColourVectorF; +typedef iColourVector ColourVectorD; - typedef iColourVector vColourVector; - typedef iColourVector vColourVectorF; - typedef iColourVector vColourVectorD; +typedef iColourVector vColourVector; +typedef iColourVector vColourVectorF; +typedef iColourVector vColourVectorD; - // SpinColourVector - typedef iSpinColourVector SpinColourVector; - typedef iSpinColourVector SpinColourVectorF; - typedef iSpinColourVector SpinColourVectorD; +// SpinColourVector +typedef iSpinColourVector SpinColourVector; +typedef iSpinColourVector SpinColourVectorF; +typedef iSpinColourVector SpinColourVectorD; - 
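//
// Naming convention running through this block (a summary, not new
// definitions): no suffix = default precision (Complex), trailing F =
// single, trailing D = double; a leading v marks the SIMD-vectorised
// form that packs Nsimd() lattice sites. For example:
//
//   SpinColourVectorD s;   // iSpinColourVector<ComplexD>, one site
//   vSpinColourVectorD vs; // iSpinColourVector<vComplexD>, Nsimd() sites packed
//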
typedef iSpinColourVector vSpinColourVector; - typedef iSpinColourVector vSpinColourVectorF; - typedef iSpinColourVector vSpinColourVectorD; +typedef iSpinColourVector vSpinColourVector; +typedef iSpinColourVector vSpinColourVectorF; +typedef iSpinColourVector vSpinColourVectorD; - // HalfSpin vector - typedef iHalfSpinVector HalfSpinVector; - typedef iHalfSpinVector HalfSpinVectorF; - typedef iHalfSpinVector HalfSpinVectorD; +// HalfSpin vector +typedef iHalfSpinVector HalfSpinVector; +typedef iHalfSpinVector HalfSpinVectorF; +typedef iHalfSpinVector HalfSpinVectorD; - typedef iHalfSpinVector vHalfSpinVector; - typedef iHalfSpinVector vHalfSpinVectorF; - typedef iHalfSpinVector vHalfSpinVectorD; +typedef iHalfSpinVector vHalfSpinVector; +typedef iHalfSpinVector vHalfSpinVectorF; +typedef iHalfSpinVector vHalfSpinVectorD; - // HalfSpinColour vector - typedef iHalfSpinColourVector HalfSpinColourVector; - typedef iHalfSpinColourVector HalfSpinColourVectorF; - typedef iHalfSpinColourVector HalfSpinColourVectorD; +// HalfSpinColour vector +typedef iHalfSpinColourVector HalfSpinColourVector; +typedef iHalfSpinColourVector HalfSpinColourVectorF; +typedef iHalfSpinColourVector HalfSpinColourVectorD; - typedef iHalfSpinColourVector vHalfSpinColourVector; - typedef iHalfSpinColourVector vHalfSpinColourVectorF; - typedef iHalfSpinColourVector vHalfSpinColourVectorD; +typedef iHalfSpinColourVector vHalfSpinColourVector; +typedef iHalfSpinColourVector vHalfSpinColourVectorF; +typedef iHalfSpinColourVector vHalfSpinColourVectorD; - // singlets - typedef iSinglet TComplex; // FIXME This is painful. Tensor singlet complex type. - typedef iSinglet TComplexF; // FIXME This is painful. Tensor singlet complex type. - typedef iSinglet TComplexD; // FIXME This is painful. Tensor singlet complex type. +// singlets +typedef iSinglet TComplex; // FIXME This is painful. Tensor singlet complex type. +typedef iSinglet TComplexF; // FIXME This is painful. Tensor singlet complex type. +typedef iSinglet TComplexD; // FIXME This is painful. Tensor singlet complex type. - typedef iSinglet vTComplex ; // what if we don't know the tensor structure - typedef iSinglet vTComplexF; // what if we don't know the tensor structure - typedef iSinglet vTComplexD; // what if we don't know the tensor structure +typedef iSinglet vTComplex ; // what if we don't know the tensor structure +typedef iSinglet vTComplexF; // what if we don't know the tensor structure +typedef iSinglet vTComplexD; // what if we don't know the tensor structure - typedef iSinglet TReal; // Shouldn't need these; can I make it work without? - typedef iSinglet TRealF; // Shouldn't need these; can I make it work without? - typedef iSinglet TRealD; // Shouldn't need these; can I make it work without? +typedef iSinglet TReal; // Shouldn't need these; can I make it work without? +typedef iSinglet TRealF; // Shouldn't need these; can I make it work without? +typedef iSinglet TRealD; // Shouldn't need these; can I make it work without? 
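An aside on the typedef tower above: every one of these types is the same three-template nesting, iScalar/iVector/iMatrix layered around a complex (or SIMD-vector complex) core, and the F/D/v variants differ only in which scalar sits innermost. A self-contained toy version of the nesting (the mini-templates below are illustrative stand-ins, not Grid's actual classes) makes the index layout concrete:

#include <complex>
#include <iostream>

// Toy stand-ins for Grid's tensor layers (illustrative only).
template <class vtype> struct iScalar { vtype _internal; };
template <class vtype, int N> struct iVector { vtype _internal[N]; };
template <class vtype, int N> struct iMatrix { vtype _internal[N][N]; };

static const int Ns = 4; // spin
static const int Nc = 3; // colour

// Mirrors the convention of the typedefs above:
// iSpinColourVector = iScalar< iVector< iVector<vtype, Nc>, Ns> >
template <class vtype>
using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;

int main() {
  iSpinColourVector<std::complex<double> > psi;
  // outer layer: singlet; middle index: spin; inner index: colour
  psi._internal._internal[2]._internal[0] = std::complex<double>(1.0, 0.0);
  std::cout << psi._internal._internal[2]._internal[0] << std::endl;
  return 0;
}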
- typedef iSinglet vTReal; - typedef iSinglet vTRealF; - typedef iSinglet vTRealD; +typedef iSinglet vTReal; +typedef iSinglet vTRealF; +typedef iSinglet vTRealD; - typedef iSinglet vTInteger; - typedef iSinglet TInteger; +typedef iSinglet vTInteger; +typedef iSinglet TInteger; - // Lattices of these - typedef Lattice LatticeColourMatrix; - typedef Lattice LatticeColourMatrixF; - typedef Lattice LatticeColourMatrixD; +// Lattices of these +typedef Lattice LatticeColourMatrix; +typedef Lattice LatticeColourMatrixF; +typedef Lattice LatticeColourMatrixD; - typedef Lattice LatticeSpinMatrix; - typedef Lattice LatticeSpinMatrixF; - typedef Lattice LatticeSpinMatrixD; +typedef Lattice LatticeSpinMatrix; +typedef Lattice LatticeSpinMatrixF; +typedef Lattice LatticeSpinMatrixD; - typedef Lattice LatticeSpinColourMatrix; - typedef Lattice LatticeSpinColourMatrixF; - typedef Lattice LatticeSpinColourMatrixD; +typedef Lattice LatticeSpinColourMatrix; +typedef Lattice LatticeSpinColourMatrixF; +typedef Lattice LatticeSpinColourMatrixD; - typedef Lattice LatticeLorentzColourMatrix; - typedef Lattice LatticeLorentzColourMatrixF; - typedef Lattice LatticeLorentzColourMatrixD; +typedef Lattice LatticeLorentzColourMatrix; +typedef Lattice LatticeLorentzColourMatrixF; +typedef Lattice LatticeLorentzColourMatrixD; - // DoubleStored gauge field - typedef Lattice LatticeDoubleStoredColourMatrix; - typedef Lattice LatticeDoubleStoredColourMatrixF; - typedef Lattice LatticeDoubleStoredColourMatrixD; +// DoubleStored gauge field +typedef Lattice LatticeDoubleStoredColourMatrix; +typedef Lattice LatticeDoubleStoredColourMatrixF; +typedef Lattice LatticeDoubleStoredColourMatrixD; - typedef Lattice LatticeSpinVector; - typedef Lattice LatticeSpinVectorF; - typedef Lattice LatticeSpinVectorD; +typedef Lattice LatticeSpinVector; +typedef Lattice LatticeSpinVectorF; +typedef Lattice LatticeSpinVectorD; - typedef Lattice LatticeColourVector; - typedef Lattice LatticeColourVectorF; - typedef Lattice LatticeColourVectorD; +typedef Lattice LatticeColourVector; +typedef Lattice LatticeColourVectorF; +typedef Lattice LatticeColourVectorD; - typedef Lattice LatticeSpinColourVector; - typedef Lattice LatticeSpinColourVectorF; - typedef Lattice LatticeSpinColourVectorD; +typedef Lattice LatticeSpinColourVector; +typedef Lattice LatticeSpinColourVectorF; +typedef Lattice LatticeSpinColourVectorD; - typedef Lattice LatticeHalfSpinVector; - typedef Lattice LatticeHalfSpinVectorF; - typedef Lattice LatticeHalfSpinVectorD; +typedef Lattice LatticeHalfSpinVector; +typedef Lattice LatticeHalfSpinVectorF; +typedef Lattice LatticeHalfSpinVectorD; - typedef Lattice LatticeHalfSpinColourVector; - typedef Lattice LatticeHalfSpinColourVectorF; - typedef Lattice LatticeHalfSpinColourVectorD; +typedef Lattice LatticeHalfSpinColourVector; +typedef Lattice LatticeHalfSpinColourVectorF; +typedef Lattice LatticeHalfSpinColourVectorD; - typedef Lattice LatticeReal; - typedef Lattice LatticeRealF; - typedef Lattice LatticeRealD; +typedef Lattice LatticeReal; +typedef Lattice LatticeRealF; +typedef Lattice LatticeRealD; - typedef Lattice LatticeComplex; - typedef Lattice LatticeComplexF; - typedef Lattice LatticeComplexD; +typedef Lattice LatticeComplex; +typedef Lattice LatticeComplexF; +typedef Lattice LatticeComplexD; - typedef Lattice LatticeInteger; // Predicates for "where" +typedef Lattice LatticeInteger; // Predicates for "where" - /////////////////////////////////////////// - // Physical names for things - 
/////////////////////////////////////////// - typedef LatticeHalfSpinColourVector LatticeHalfFermion; - typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; - typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD; +/////////////////////////////////////////// +// Physical names for things +/////////////////////////////////////////// +typedef LatticeHalfSpinColourVector LatticeHalfFermion; +typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; +typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD; - typedef LatticeSpinColourVector LatticeFermion; - typedef LatticeSpinColourVectorF LatticeFermionF; - typedef LatticeSpinColourVectorD LatticeFermionD; +typedef LatticeSpinColourVector LatticeFermion; +typedef LatticeSpinColourVectorF LatticeFermionF; +typedef LatticeSpinColourVectorD LatticeFermionD; - typedef LatticeSpinColourMatrix LatticePropagator; - typedef LatticeSpinColourMatrixF LatticePropagatorF; - typedef LatticeSpinColourMatrixD LatticePropagatorD; +typedef LatticeSpinColourMatrix LatticePropagator; +typedef LatticeSpinColourMatrixF LatticePropagatorF; +typedef LatticeSpinColourMatrixD LatticePropagatorD; - typedef LatticeLorentzColourMatrix LatticeGaugeField; - typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; - typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; +typedef LatticeLorentzColourMatrix LatticeGaugeField; +typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; +typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; - typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; - typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; - typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; +typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; +typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; +typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; - template using LorentzScalar = Lattice >; +template using LorentzScalar = Lattice >; - // Uhgg... typing this hurt ;) - // (my keyboard got burning hot when I typed this, must be the anti-Fermion) - typedef Lattice LatticeStaggeredFermion; - typedef Lattice LatticeStaggeredFermionF; - typedef Lattice LatticeStaggeredFermionD; +// Uhgg... 
typing this hurt ;) +// (my keyboard got burning hot when I typed this, must be the anti-Fermion) +typedef Lattice LatticeStaggeredFermion; +typedef Lattice LatticeStaggeredFermionF; +typedef Lattice LatticeStaggeredFermionD; - typedef Lattice LatticeStaggeredPropagator; - typedef Lattice LatticeStaggeredPropagatorF; - typedef Lattice LatticeStaggeredPropagatorD; +typedef Lattice LatticeStaggeredPropagator; +typedef Lattice LatticeStaggeredPropagatorF; +typedef Lattice LatticeStaggeredPropagatorD; - ////////////////////////////////////////////////////////////////////////////// - // Peek and Poke named after physics attributes - ////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// +// Peek and Poke named after physics attributes +////////////////////////////////////////////////////////////////////////////// - //spin - template auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } - template auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) - { - return PeekIndex(rhs,i,j); - } - template auto peekSpin(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } - template auto peekSpin(const Lattice &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) - { - return PeekIndex(rhs,i,j); - } - //colour - template auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } - template auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) - { - return PeekIndex(rhs,i,j); - } - template auto peekColour(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } - template auto peekColour(const Lattice &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) - { - return PeekIndex(rhs,i,j); - } - //lorentz - template auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } - template auto peekLorentz(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) - { - return PeekIndex(rhs,i); - } +//spin +template auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} +template auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) +{ + return PeekIndex(rhs,i,j); +} +template auto peekSpin(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} +template auto peekSpin(const Lattice &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) +{ + return PeekIndex(rhs,i,j); +} +//colour +template auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} +template auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) +{ + return PeekIndex(rhs,i,j); +} +template auto peekColour(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} +template auto peekColour(const Lattice &rhs,int i,int j) -> decltype(PeekIndex(rhs,0,0)) +{ + return PeekIndex(rhs,i,j); +} +//lorentz +template auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} +template auto peekLorentz(const Lattice &rhs,int i) -> decltype(PeekIndex(rhs,0)) +{ + return PeekIndex(rhs,i); +} - ////////////////////////////////////////////// - // Poke lattice - ////////////////////////////////////////////// - template - void pokeColour(Lattice &lhs, - const Lattice(lhs._odata[0],0))> & rhs, 
- int i) - { - PokeIndex(lhs,rhs,i); - } - template - void pokeColour(Lattice &lhs, - const Lattice(lhs._odata[0],0,0))> & rhs, - int i,int j) - { - PokeIndex(lhs,rhs,i,j); - } - template - void pokeSpin(Lattice &lhs, +////////////////////////////////////////////// +// Poke lattice +////////////////////////////////////////////// +template +void pokeColour(Lattice &lhs, + const Lattice(lhs._odata[0],0))> & rhs, + int i) +{ + PokeIndex(lhs,rhs,i); +} +template +void pokeColour(Lattice &lhs, + const Lattice(lhs._odata[0],0,0))> & rhs, + int i,int j) +{ + PokeIndex(lhs,rhs,i,j); +} +template +void pokeSpin(Lattice &lhs, const Lattice(lhs._odata[0],0))> & rhs, int i) - { - PokeIndex(lhs,rhs,i); - } - template - void pokeSpin(Lattice &lhs, +{ + PokeIndex(lhs,rhs,i); +} +template +void pokeSpin(Lattice &lhs, const Lattice(lhs._odata[0],0,0))> & rhs, int i,int j) - { - PokeIndex(lhs,rhs,i,j); - } - template - void pokeLorentz(Lattice &lhs, - const Lattice(lhs._odata[0],0))> & rhs, - int i) - { - PokeIndex(lhs,rhs,i); - } +{ + PokeIndex(lhs,rhs,i,j); +} +template +void pokeLorentz(Lattice &lhs, + const Lattice(lhs._odata[0],0))> & rhs, + int i) +{ + PokeIndex(lhs,rhs,i); +} - ////////////////////////////////////////////// - // Poke scalars - ////////////////////////////////////////////// - template void pokeSpin(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) - { - pokeIndex(lhs,rhs,i); - } - template void pokeSpin(vobj &lhs,const decltype(peekIndex(lhs,0,0)) & rhs,int i,int j) - { - pokeIndex(lhs,rhs,i,j); - } +////////////////////////////////////////////// +// Poke scalars +////////////////////////////////////////////// +template void pokeSpin(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) +{ + pokeIndex(lhs,rhs,i); +} +template void pokeSpin(vobj &lhs,const decltype(peekIndex(lhs,0,0)) & rhs,int i,int j) +{ + pokeIndex(lhs,rhs,i,j); +} - template void pokeColour(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) - { - pokeIndex(lhs,rhs,i); - } - template void pokeColour(vobj &lhs,const decltype(peekIndex(lhs,0,0)) & rhs,int i,int j) - { - pokeIndex(lhs,rhs,i,j); - } +template void pokeColour(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) +{ + pokeIndex(lhs,rhs,i); +} +template void pokeColour(vobj &lhs,const decltype(peekIndex(lhs,0,0)) & rhs,int i,int j) +{ + pokeIndex(lhs,rhs,i,j); +} - template void pokeLorentz(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) - { - pokeIndex(lhs,rhs,i); - } +template void pokeLorentz(vobj &lhs,const decltype(peekIndex(lhs,0)) & rhs,int i) +{ + pokeIndex(lhs,rhs,i); +} - ////////////////////////////////////////////// - // Fermion <-> propagator assignements - ////////////////////////////////////////////// - template - void FermToProp(Prop &p, const Ferm &f, const int s, const int c) +////////////////////////////////////////////// +// Fermion <-> propagator assignements +////////////////////////////////////////////// +template +void FermToProp(Prop &p, const Ferm &f, const int s, const int c) +{ + for(int j = 0; j < Ns; ++j) { - for(int j = 0; j < Ns; ++j) - { - auto pjs = peekSpin(p, j, s); - auto fj = peekSpin(f, j); + auto pjs = peekSpin(p, j, s); + auto fj = peekSpin(f, j); - for(int i = 0; i < Nc; ++i) - { - pokeColour(pjs, peekColour(fj, i), i, c); - } - pokeSpin(p, pjs, j, s); - } + for(int i = 0; i < Nc; ++i) + { + pokeColour(pjs, peekColour(fj, i), i, c); + } + pokeSpin(p, pjs, j, s); } +} - template - void PropToFerm(Ferm &f, const Prop &p, const int s, const int c) +template +void PropToFerm(Ferm &f, const Prop 
&p, const int s, const int c) +{ + for(int j = 0; j < Ns; ++j) { - for(int j = 0; j < Ns; ++j) - { - auto pjs = peekSpin(p, j, s); - auto fj = peekSpin(f, j); + auto pjs = peekSpin(p, j, s); + auto fj = peekSpin(f, j); - for(int i = 0; i < Nc; ++i) - { - pokeColour(fj, peekColour(pjs, i, c), i); - } - pokeSpin(f, fj, j); - } + for(int i = 0; i < Nc; ++i) + { + pokeColour(fj, peekColour(pjs, i, c), i); + } + pokeSpin(f, fj, j); } +} - ////////////////////////////////////////////// - // transpose array and scalar - ////////////////////////////////////////////// - template inline Lattice transposeSpin(const Lattice &lhs){ - return transposeIndex(lhs); - } - template inline Lattice transposeColour(const Lattice &lhs){ - return transposeIndex(lhs); - } - template inline vobj transposeSpin(const vobj &lhs){ - return transposeIndex(lhs); - } - template inline vobj transposeColour(const vobj &lhs){ - return transposeIndex(lhs); - } +////////////////////////////////////////////// +// transpose array and scalar +////////////////////////////////////////////// +template inline Lattice transposeSpin(const Lattice &lhs){ + return transposeIndex(lhs); +} +template inline Lattice transposeColour(const Lattice &lhs){ + return transposeIndex(lhs); +} +template inline vobj transposeSpin(const vobj &lhs){ + return transposeIndex(lhs); +} +template inline vobj transposeColour(const vobj &lhs){ + return transposeIndex(lhs); +} - ////////////////////////////////////////// - // Trace lattice and non-lattice - ////////////////////////////////////////// - template - inline auto traceSpin(const Lattice &lhs) -> Lattice(lhs._odata[0]))> - { - return traceIndex(lhs); - } - template - inline auto traceColour(const Lattice &lhs) -> Lattice(lhs._odata[0]))> - { - return traceIndex(lhs); - } - template - inline auto traceSpin(const vobj &lhs) -> Lattice(lhs))> - { - return traceIndex(lhs); - } - template - inline auto traceColour(const vobj &lhs) -> Lattice(lhs))> - { - return traceIndex(lhs); - } +////////////////////////////////////////// +// Trace lattice and non-lattice +////////////////////////////////////////// +template +inline auto traceSpin(const Lattice &lhs) -> Lattice(lhs._odata[0]))> +{ + return traceIndex(lhs); +} +template +inline auto traceColour(const Lattice &lhs) -> Lattice(lhs._odata[0]))> +{ + return traceIndex(lhs); +} +template +inline auto traceSpin(const vobj &lhs) -> Lattice(lhs))> +{ + return traceIndex(lhs); +} +template +inline auto traceColour(const vobj &lhs) -> Lattice(lhs))> +{ + return traceIndex(lhs); +} - ////////////////////////////////////////// - // Current types - ////////////////////////////////////////// - GRID_SERIALIZABLE_ENUM(Current, undef, - Vector, 0, - Axial, 1, - Tadpole, 2); - -} //namespace QCD -} // Grid - -/* -<<<<<<< HEAD -#include -#include -#include -#include -#include - -// Include representations -#include -#include -#include -#include - -// Scalar field -#include - -#include - -#include - -#include -#include -#include -#include - - -//#include -======= - ->>>>>>> develop -*/ +////////////////////////////////////////// +// Current types +////////////////////////////////////////// +GRID_SERIALIZABLE_ENUM(Current, undef, + Vector, 0, + Axial, 1, + Tadpole, 2); +NAMESPACE_END(Grid); #endif From 59ba9ff3bbe1c0d38c33f3ef32226e471bc5d50b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:52:27 +0000 Subject: [PATCH 049/754] NAMESPACE & format --- lib/qcd/spin/Dirac.h | 92 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 46 
deletions(-) diff --git a/lib/qcd/spin/Dirac.h b/lib/qcd/spin/Dirac.h index ed22ec97..bedd6a40 100644 --- a/lib/qcd/spin/Dirac.h +++ b/lib/qcd/spin/Dirac.h @@ -28,35 +28,35 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef GRID_QCD_DIRAC_H #define GRID_QCD_DIRAC_H -// Gamma matrices using the code generated by the Mathematica notebook -// gamma-gen/gamma-gen.nb in Gamma.cc & Gamma.h -//////////////////////////////////////////////////////////////////////////////// + // Gamma matrices using the code generated by the Mathematica notebook + // gamma-gen/gamma-gen.nb in Gamma.cc & Gamma.h + //////////////////////////////////////////////////////////////////////////////// #include -namespace Grid { +NAMESPACE_BEGIN(Grid); // Dirac algebra adjoint operator (not in QCD:: to overload other adj) inline QCD::Gamma adj(const QCD::Gamma &g) { - return QCD::Gamma (QCD::Gamma::adj[g.g]); + return QCD::Gamma (QCD::Gamma::adj[g.g]); } -namespace QCD { + // Dirac algebra mutliplication operator inline Gamma operator*(const Gamma &g1, const Gamma &g2) { - return Gamma (Gamma::mul[g1.g][g2.g]); + return Gamma (Gamma::mul[g1.g][g2.g]); } // general left multiply template inline auto operator*(const Gamma &G, const iScalar &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type { iScalar ret; ret._internal=G*arg._internal; @@ -65,7 +65,7 @@ inline auto operator*(const Gamma &G, const iScalar &arg) template inline auto operator*(const Gamma &G, const iVector &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iVector>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iVector>::type { iVector ret; for(int i=0;i &arg) template inline auto operator*(const Gamma &G, const iMatrix &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type { iMatrix ret; for(int i=0;i inline auto operator*(const iScalar &arg, const Gamma &G) -->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type { iScalar ret; ret._internal=arg._internal*G; @@ -98,13 +98,13 @@ inline auto operator*(const iScalar &arg, const Gamma &G) template inline auto operator * (const iMatrix &arg, const Gamma &G) -->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type { iMatrix ret; for(int i=0;i inline auto operator*(const GammaL &gl, const iVector &arg) -->typename std::enable_if, SpinorIndex>::value, iVector>::type + ->typename std::enable_if, SpinorIndex>::value, iVector>::type { iVector buf; @@ -138,17 +138,17 @@ inline auto operator*(const GammaL &gl, const iVector &arg) // matrix left multiply template inline auto operator*(const GammaL &gl, const iMatrix &arg) -->typename std::enable_if, SpinorIndex>::value, iMatrix>::type + ->typename std::enable_if, SpinorIndex>::value, iMatrix>::type { iMatrix buf; for(unsigned int i = 0; i < Ns; ++i) - { - buf(0, i) = 0.; - buf(1, i) = 0.; - buf(2, i) = 2.*arg(2, i); - buf(3, i) = 2.*arg(3, i); - } + { + buf(0, i) = 0.; + buf(1, i) = 0.; + buf(2, i) = 2.*arg(2, i); + buf(3, i) = 2.*arg(3, i); + } return gl.gamma*buf; }; @@ -156,18 +156,18 @@ inline auto operator*(const 
GammaL &gl, const iMatrix &arg) // matrix right multiply template inline auto operator*(const iMatrix &arg, const GammaL &gl) -->typename std::enable_if, SpinorIndex>::value, iMatrix>::type + ->typename std::enable_if, SpinorIndex>::value, iMatrix>::type { iMatrix buf; buf = arg*gl.gamma; for(unsigned int i = 0; i < Ns; ++i) - { - buf(i, 0) = 0.; - buf(i, 1) = 0.; - buf(i, 2) = 2.*buf(i, 2); - buf(i, 3) = 2.*buf(i, 3); - } + { + buf(i, 0) = 0.; + buf(i, 1) = 0.; + buf(i, 2) = 2.*buf(i, 2); + buf(i, 3) = 2.*buf(i, 3); + } return buf; }; @@ -175,7 +175,7 @@ inline auto operator*(const iMatrix &arg, const GammaL &gl) //general left multiply template inline auto operator*(const GammaL &gl, const iScalar &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type { iScalar ret; ret._internal=gl*arg._internal; @@ -184,7 +184,7 @@ inline auto operator*(const GammaL &gl, const iScalar &arg) template inline auto operator*(const GammaL &gl, const iVector &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iVector>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iVector>::type { iVector ret; for(int i=0;i &arg) template inline auto operator*(const GammaL &gl, const iMatrix &arg) -->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type { iMatrix ret; for(int i=0;i inline auto operator*(const iScalar &arg, const GammaL &gl) -->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iScalar>::type { iScalar ret; ret._internal=arg._internal*gl; @@ -217,16 +217,16 @@ inline auto operator*(const iScalar &arg, const GammaL &gl) template inline auto operator * (const iMatrix &arg, const GammaL &gl) -->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type + ->typename std::enable_if,SpinorIndex>::notvalue,iMatrix>::type { iMatrix ret; for(int i=0;i Date: Sun, 14 Jan 2018 21:53:05 +0000 Subject: [PATCH 050/754] Namespace --- lib/qcd/utils/WilsonLoops.h | 206 ++++++++++++++++++------------------ 1 file changed, 103 insertions(+), 103 deletions(-) diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 90d905d3..d2e350c3 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -31,8 +31,8 @@ directory /* END LEGAL */ #ifndef QCD_UTILS_WILSON_LOOPS_H #define QCD_UTILS_WILSON_LOOPS_H -namespace Grid { -namespace QCD { + +NAMESPACE_BEGIN(Grid); // Common wilson loop observables template class WilsonLoops : public Gimpl { @@ -55,16 +55,16 @@ public: // purpose of deriving // from Gimpl. 
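The hard-coded rows in the GammaL multiplies patched above rely on the chiral basis, where gamma5 is diagonal with eigenvalues (1,1,-1,-1), so (1 - gamma5) zeroes the first two spin components and doubles the last two: exactly the buf(0)=buf(1)=0, buf(2,3)=2*arg(2,3) pattern in the hunks. A short numeric check of that identity (plain C++, not Grid code):

#include <array>
#include <cassert>

int main() {
  // chiral-basis gamma5 is diagonal, so (1 - g5) acts componentwise
  std::array<double, 4> g5 = {1, 1, -1, -1};
  std::array<double, 4> v = {7, 8, 9, 10}, w;
  for (int i = 0; i < 4; ++i) w[i] = (1.0 - g5[i]) * v[i];
  assert(w[0] == 0 && w[1] == 0);               // upper components projected out
  assert(w[2] == 2 * v[2] && w[3] == 2 * v[3]); // lower components doubled
  return 0;
}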
/* - plaq = Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftBackward( - U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu]))); - */ + plaq = Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward( + U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu]))); + */ // _ //|< _| plaq = Gimpl::CovShiftForward(U[mu],mu, - Gimpl::CovShiftForward(U[nu],nu, - Gimpl::CovShiftBackward(U[mu],mu, - Gimpl::CovShiftIdentityBackward(U[nu], nu)))); + Gimpl::CovShiftForward(U[nu],nu, + Gimpl::CovShiftBackward(U[mu],mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu)))); @@ -170,11 +170,11 @@ public: // staple += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), - mu); + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), + mu); // __ // | @@ -182,15 +182,15 @@ public: // // staple += Gimpl::ShiftStaple( - Gimpl::CovShiftBackward(U[nu], nu, - Gimpl::CovShiftBackward(U[mu], mu, U[nu])), - mu); + Gimpl::CovShiftBackward(U[nu], nu, + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), + mu); } } -// For the force term -static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { + // For the force term + static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { GridBase *grid = Umu._grid; std::vector U(Nd, grid); for (int d = 0; d < Nd; d++) { @@ -212,7 +212,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { } } staple = U[mu]*staple; -} + } ////////////////////////////////////////////////// // the sum over all staples on each site @@ -241,11 +241,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // staple += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), - mu); + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), + mu); // __ // | @@ -254,8 +254,8 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // staple += Gimpl::ShiftStaple( - Gimpl::CovShiftBackward(U[nu], nu, - Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); + Gimpl::CovShiftBackward(U[nu], nu, + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); } } } @@ -283,11 +283,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // staple = Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), - mu); + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), + mu); } } @@ -314,8 +314,8 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // // staple = Gimpl::ShiftStaple( - Gimpl::CovShiftBackward(U[nu], nu, - Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); + Gimpl::CovShiftBackward(U[nu], nu, + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); } } @@ -323,19 +323,19 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // Field Strength ////////////////////////////////////////////////////// static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){ - // Fmn +--<--+ Ut +--<--+ - // | | | | - // (x)+-->--+ +-->--+(x) - // | | | | - // +--<--+ +--<--+ + // Fmn +--<--+ Ut +--<--+ + // | | | | + // (x)+-->--+ +-->--+(x) + // | | | | + // +--<--+ +--<--+ - GaugeMat 
Vup(Umu._grid), Vdn(Umu._grid); - StapleUpper(Vup, Umu, mu, nu); - StapleLower(Vdn, Umu, mu, nu); - GaugeMat v = Vup - Vdn; - GaugeMat u = PeekIndex(Umu, mu); // some redundant copies - GaugeMat vu = v*u; - FS = 0.25*Ta(u*v + Cshift(vu, mu, -1)); + GaugeMat Vup(Umu._grid), Vdn(Umu._grid); + StapleUpper(Vup, Umu, mu, nu); + StapleLower(Vdn, Umu, mu, nu); + GaugeMat v = Vup - Vdn; + GaugeMat u = PeekIndex(Umu, mu); // some redundant copies + GaugeMat vu = v*u; + FS = 0.25*Ta(u*v + Cshift(vu, mu, -1)); } static Real TopologicalCharge(GaugeLorentz &U){ @@ -366,14 +366,14 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { static void dirRectangle(GaugeMat &rect, const std::vector &U, const int mu, const int nu) { rect = Gimpl::CovShiftForward( - U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->| - adj(Gimpl::CovShiftForward( - U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu]))); + U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->| + adj(Gimpl::CovShiftForward( + U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu]))); rect = rect + - Gimpl::CovShiftForward( - U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->|| - adj(Gimpl::CovShiftForward( - U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu]))); + Gimpl::CovShiftForward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->|| + adj(Gimpl::CovShiftForward( + U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu]))); } static void traceDirRectangle(ComplexField &rect, const std::vector &U, const int mu, @@ -537,52 +537,52 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // | __ | // Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[mu], mu, - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, - Gimpl::CovShiftBackward( - U[mu], mu, - Gimpl::CovShiftIdentityBackward(U[nu], nu))))), - mu); + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); // __ // |__ __ | Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[mu], mu, - Gimpl::CovShiftBackward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))), - mu); + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))), + mu); // __ // |__ __ | Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftBackward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))), - mu); + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))), + mu); // __ ___ // |__ | Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))), - mu); + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))), + mu); // -- // | | @@ -590,16 +590,16 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // | | Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftForward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, - 
Gimpl::CovShiftBackward( - U[nu], nu, - Gimpl::CovShiftIdentityBackward(U[nu], nu))))), - mu); + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); // | | // @@ -607,13 +607,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { // -- Stap += Gimpl::ShiftStaple( - Gimpl::CovShiftBackward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[nu], nu, - Gimpl::CovShiftBackward( - U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))), - mu); + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))), + mu); } } } @@ -623,7 +623,7 @@ typedef WilsonLoops ColourWilsonLoops; typedef WilsonLoops U1WilsonLoops; typedef WilsonLoops SU2WilsonLoops; typedef WilsonLoops SU3WilsonLoops; -} -} + +NAMESPACE_END(Grid); #endif From c1438cbbe3599dcc9b255b8b60def74a36a21851 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:53:39 +0000 Subject: [PATCH 051/754] Namespace --- lib/qcd/utils/SpaceTimeGrid.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/qcd/utils/SpaceTimeGrid.h b/lib/qcd/utils/SpaceTimeGrid.h index 61613e4d..f7d0a13e 100644 --- a/lib/qcd/utils/SpaceTimeGrid.h +++ b/lib/qcd/utils/SpaceTimeGrid.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,15 +23,15 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_QCD_SPACE_TIME_GRID_H #define GRID_QCD_SPACE_TIME_GRID_H -namespace Grid { -namespace QCD { + +NAMESPACE_BEGIN(Grid); class SpaceTimeGrid { - public: +public: static GridCartesian *makeFourDimGrid(const std::vector & latt,const std::vector &simd,const std::vector &mpi); static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid); @@ -45,6 +45,6 @@ class SpaceTimeGrid { }; -}} +NAMESPACE_END(Grid); #endif From 1dbea9aa6967bbe1657ec44f9428c2c3ad657473 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:54:28 +0000 Subject: [PATCH 052/754] Namespace --- lib/qcd/utils/SpaceTimeGrid.cc | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lib/qcd/utils/SpaceTimeGrid.cc b/lib/qcd/utils/SpaceTimeGrid.cc index b2b5d9c8..93567b86 100644 --- a/lib/qcd/utils/SpaceTimeGrid.cc +++ b/lib/qcd/utils/SpaceTimeGrid.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,13 +23,12 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
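An aside on the SpaceTimeGrid factories declared in patch 051 above and implemented in the .cc patch that follows: the five-dimensional domain-wall grids extend the four-dimensional extents by one s-direction, and the cb5(1,0) checkerboard vector in the later hunks suggests the s-extent occupies the first coordinate. A rough sketch of that layout convention (plain vectors and hypothetical example values, not Grid's actual grid objects):

#include <iostream>
#include <vector>

int main() {
  // four-dimensional layout (hypothetical example values)
  std::vector<int> latt4 = {8, 8, 8, 16}; // x,y,z,t extents
  std::vector<int> simd4 = {1, 2, 2, 2};  // SIMD decomposition
  std::vector<int> mpi4  = {1, 1, 1, 1};  // MPI decomposition
  int Ls = 12;                            // fifth (s) extent

  // prepend the s-dimension; SIMD and MPI stay trivial in that direction
  std::vector<int> latt5(1, Ls), simd5(1, 1), mpi5(1, 1);
  latt5.insert(latt5.end(), latt4.begin(), latt4.end());
  simd5.insert(simd5.end(), simd4.begin(), simd4.end());
  mpi5.insert(mpi5.end(), mpi4.begin(), mpi4.end());

  for (size_t d = 0; d < latt5.size(); ++d) std::cout << latt5[d] << " ";
  std::cout << std::endl; // 12 8 8 8 16
  return 0;
}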
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include #include -namespace Grid { - namespace QCD { +NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////// // Public interface @@ -58,7 +57,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian for(int d=0;d_fdimensions[d]); simd5.push_back(FourDimGrid->_simd_layout[d]); - mpi5.push_back(FourDimGrid->_processors[d]); + mpi5.push_back(FourDimGrid->_processors[d]); } return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); } @@ -70,7 +69,7 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC int cbd=1; std::vector cb5(1,0); for(int d=0;d_fdimensions[d]); simd5.push_back(1); - mpi5.push_back(FourDimGrid->_processors[d]); + mpi5.push_back(FourDimGrid->_processors[d]); } return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); } @@ -105,7 +104,7 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr int cbd=1; std::vector cb5(1,0); for(int d=0;d Date: Sun, 14 Jan 2018 21:55:47 +0000 Subject: [PATCH 053/754] Namespace --- lib/qcd/utils/ScalarObjs.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/lib/qcd/utils/ScalarObjs.h b/lib/qcd/utils/ScalarObjs.h index a284af40..a8e563c5 100644 --- a/lib/qcd/utils/ScalarObjs.h +++ b/lib/qcd/utils/ScalarObjs.h @@ -28,15 +28,13 @@ directory /* END LEGAL */ #ifndef SCALAR_OBJS_H #define SCALAR_OBJS_H -namespace Grid { - // FIXME drop the QCD namespace in Nd - +NAMESPACE_BEGIN(Grid); // Scalar field obs template class ScalarObs { - public: +public: ////////////////////////////////////////////////// // squared field ////////////////////////////////////////////////// @@ -61,7 +59,7 @@ class ScalarObs { static void phider(typename Impl::Field &fsq, const typename Impl::Field &f) { fsq = Cshift(f, 0, -1) * f; - for (int mu = 1; mu < QCD::Nd; mu++) fsq += Cshift(f, mu, -1) * f; + for (int mu = 1; mu < Nd; mu++) fsq += Cshift(f, mu, -1) * f; } ////////////////////////////////////////////////// @@ -71,7 +69,7 @@ class ScalarObs { static RealD sumphider(const typename Impl::Field &f) { typename Impl::Field tmp(f._grid); tmp = Cshift(f, 0, -1) * f; - for (int mu = 1; mu < QCD::Nd; mu++) { + for (int mu = 1; mu < Nd; mu++) { tmp += Cshift(f, mu, -1) * f; } return -sum(trace(tmp)); @@ -90,7 +88,6 @@ class ScalarObs { } }; - -} +NAMESPACE_END(Grid); #endif From d58b7cf9b9b4e389960ed82e0fe8b0e75962998b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:56:55 +0000 Subject: [PATCH 054/754] Namespace changes --- lib/qcd/utils/SUnTwoIndex.h | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/lib/qcd/utils/SUnTwoIndex.h b/lib/qcd/utils/SUnTwoIndex.h index 0c09f5e2..c9b301e8 100644 --- a/lib/qcd/utils/SUnTwoIndex.h +++ b/lib/qcd/utils/SUnTwoIndex.h @@ -26,8 +26,7 @@ #define QCD_UTIL_SUN2INDEX_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); enum TwoIndexSymmetry { Symmetric = 1, AntiSymmetric = -1 }; @@ -35,7 +34,7 @@ inline Real delta(int a, int b) { return (a == b) ? 
1.0 : 0.0; } template class SU_TwoIndex : public SU { - public: +public: static const int Dimension = ncolour * (ncolour + S) / 2; static const int NumGenerators = SU::AdjointDimension; @@ -55,11 +54,11 @@ class SU_TwoIndex : public SU { typedef Lattice LatticeTwoIndexMatrixD; typedef Lattice >, Nd> > - LatticeTwoIndexField; + LatticeTwoIndexField; typedef Lattice >, Nd> > - LatticeTwoIndexFieldF; + LatticeTwoIndexFieldF; typedef Lattice >, Nd> > - LatticeTwoIndexFieldD; + LatticeTwoIndexFieldD; template using iSUnMatrix = iScalar > >; @@ -109,7 +108,7 @@ class SU_TwoIndex : public SU { for (int k = 0; k < ncolour; k++) for (int l = 0; l < ncolour; l++) eij()()(l, k) = delta(i, k) * delta(j, l) + - S * delta(j, k) * delta(i, l); + S * delta(j, k) * delta(i, l); RealD nrm = 1. / std::sqrt(2.0); eij = eij * nrm; @@ -128,7 +127,7 @@ class SU_TwoIndex : public SU { template static void generator(int Index, iSUnTwoIndexMatrix &i2indTa) { Vector::template iSUnMatrix > ta( - ncolour * ncolour - 1); + ncolour * ncolour - 1); Vector::template iSUnMatrix > eij(Dimension); typename SU::template iSUnMatrix tmp; i2indTa = zero; @@ -142,7 +141,7 @@ class SU_TwoIndex : public SU { tmp = transpose(ta[Index]) * adj(eij[a]) + adj(eij[a]) * ta[Index]; for (int b = 0; b < Dimension; b++) { typename SU::template iSUnMatrix tmp1 = - tmp * eij[b]; + tmp * eij[b]; Complex iTr = TensorRemove(timesI(trace(tmp1))); i2indTa()()(a, b) = iTr; } @@ -197,8 +196,8 @@ class SU_TwoIndex : public SU { } static void TwoIndexLieAlgebraMatrix( - const typename SU::LatticeAlgebraVector &h, - LatticeTwoIndexMatrix &out, Real scale = 1.0) { + const typename SU::LatticeAlgebraVector &h, + LatticeTwoIndexMatrix &out, Real scale = 1.0) { conformable(h, out); GridBase *grid = out._grid; LatticeTwoIndexMatrix la(grid); @@ -216,8 +215,8 @@ class SU_TwoIndex : public SU { // Projects the algebra components // of a lattice matrix ( of dimension ncol*ncol -1 ) static void projectOnAlgebra( - typename SU::LatticeAlgebraVector &h_out, - const LatticeTwoIndexMatrix &in, Real scale = 1.0) { + typename SU::LatticeAlgebraVector &h_out, + const LatticeTwoIndexMatrix &in, Real scale = 1.0) { conformable(h_out, in); h_out = zero; TIMatrix i2indTa; @@ -245,8 +244,8 @@ class SU_TwoIndex : public SU { } Real coefficient = - -2.0 / (ncolour + 2 * S) * scale; // 2/(Nc +/- 2) for the normalization - // of the trace in the two index rep + -2.0 / (ncolour + 2 * S) * scale; // 2/(Nc +/- 2) for the normalization + // of the trace in the two index rep for (int a = 0; a < ncolour * ncolour - 1; a++) { auto tmp = real(trace(i2indTa[a] * in)) * coefficient; @@ -269,8 +268,6 @@ typedef SU_TwoIndex<3, AntiSymmetric> SU3TwoIndexAntiSymm; typedef SU_TwoIndex<4, AntiSymmetric> SU4TwoIndexAntiSymm; typedef SU_TwoIndex<5, AntiSymmetric> SU5TwoIndexAntiSymm; - -} -} +NAMESPACE_END(Grid); #endif From 66f8a2f082454e774e500e3a17cb6c066a70f607 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:57:46 +0000 Subject: [PATCH 055/754] Namespace --- lib/qcd/utils/SUnAdjoint.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/qcd/utils/SUnAdjoint.h b/lib/qcd/utils/SUnAdjoint.h index 9d9b77bd..a37d0cdb 100644 --- a/lib/qcd/utils/SUnAdjoint.h +++ b/lib/qcd/utils/SUnAdjoint.h @@ -22,17 +22,16 @@ // //////////////////////////////////////////////////////////////////////// -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); template class SU_Adjoint : public SU { - public: +public: static const int Dimension = 
ncolour * ncolour - 1; template using iSUnAdjointMatrix = - iScalar > >; + iScalar > >; // Actually the adjoint matrices are real... // Consider this overhead... FIXME @@ -49,11 +48,11 @@ class SU_Adjoint : public SU { typedef Lattice LatticeAdjMatrixD; typedef Lattice >, Nd> > - LatticeAdjField; + LatticeAdjField; typedef Lattice >, Nd> > - LatticeAdjFieldF; + LatticeAdjFieldF; typedef Lattice >, Nd> > - LatticeAdjFieldD; + LatticeAdjFieldD; @@ -73,7 +72,7 @@ class SU_Adjoint : public SU { tmp = ta[a] * ta[Index] - ta[Index] * ta[a]; for (int b = 0; b < (ncolour * ncolour - 1); b++) { typename SU::template iSUnMatrix tmp1 = - 2.0 * tmp * ta[b]; // 2.0 from the normalization + 2.0 * tmp * ta[b]; // 2.0 from the normalization Complex iTr = TensorRemove(timesI(trace(tmp1))); //iAdjTa()()(b, a) = iTr; iAdjTa()()(a, b) = iTr; @@ -112,8 +111,8 @@ class SU_Adjoint : public SU { } static void AdjointLieAlgebraMatrix( - const typename SU::LatticeAlgebraVector &h, - LatticeAdjMatrix &out, Real scale = 1.0) { + const typename SU::LatticeAlgebraVector &h, + LatticeAdjMatrix &out, Real scale = 1.0) { conformable(h, out); GridBase *grid = out._grid; LatticeAdjMatrix la(grid); @@ -150,11 +149,11 @@ class SU_Adjoint : public SU { static bool precalculated = false; if (!precalculated){ precalculated = true; - for (int a = 0; a < Dimension; a++) generator(a, iTa[a]); + for (int a = 0; a < Dimension; a++) generator(a, iTa[a]); } Real coefficient = -1.0 / (ncolour) * scale; // 1/Nc for the normalization of - // the trace in the adj rep + // the trace in the adj rep for (int a = 0; a < Dimension; a++) { auto tmp = real(trace(iTa[a] * in)) * coefficient; @@ -176,7 +175,7 @@ typedef SU_Adjoint<4> SU4Adjoint; typedef SU_Adjoint<5> SU5Adjoint; typedef SU_Adjoint AdjointMatrices; -} -} + +NAMESPACE_END(Grid); #endif From b331ecea78edc776dea766ed11432ab183d20f64 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:58:47 +0000 Subject: [PATCH 056/754] Namespace --- lib/qcd/utils/SUn.h | 132 ++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 67 deletions(-) diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index cdc6c961..318f001a 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -28,16 +28,15 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef QCD_UTIL_SUN_H #define QCD_UTIL_SUN_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); template class SU { - public: +public: static const int Dimension = ncolour; static const int AdjointDimension = ncolour * ncolour - 1; static int su2subgroups(void) { return (ncolour * (ncolour - 1)) / 2; } @@ -48,7 +47,7 @@ class SU { using iSU2Matrix = iScalar > >; template using iSUnAlgebraVector = - iScalar > >; + iScalar > >; ////////////////////////////////////////////////////////////////////////////////////////////////// // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, @@ -238,7 +237,7 @@ class SU { // this should be purely real Determinant._odata[ss] = - Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0); + Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0); } } @@ -273,11 +272,11 @@ class SU { // /////////////////////////////////////////////// static void SubGroupHeatBath( - GridSerialRNG &sRNG, GridParallelRNG &pRNG, - RealD beta, 
// coeff multiplying staple in action (with no 1/Nc) - LatticeMatrix &link, - const LatticeMatrix &barestaple, // multiplied by action coeffs so th - int su2_subgroup, int nheatbath, LatticeInteger &wheremask) { + GridSerialRNG &sRNG, GridParallelRNG &pRNG, + RealD beta, // coeff multiplying staple in action (with no 1/Nc) + LatticeMatrix &link, + const LatticeMatrix &barestaple, // multiplied by action coeffs so th + int su2_subgroup, int nheatbath, LatticeInteger &wheremask) { GridBase *grid = link._grid; int ntrials = 0; @@ -293,7 +292,7 @@ class SU { // Subgroup manipulation in the lie algebra space LatticeSU2Matrix u( - grid); // Kennedy pendleton "u" real projected normalised Sigma + grid); // Kennedy pendleton "u" real projected normalised Sigma LatticeSU2Matrix uinv(grid); LatticeSU2Matrix ua(grid); // a in pauli form LatticeSU2Matrix b(grid); // rotated matrix after hb @@ -314,41 +313,41 @@ class SU { mask_false = 0; /* - PLB 156 P393 (1985) (Kennedy and Pendleton) + PLB 156 P393 (1985) (Kennedy and Pendleton) - Note: absorb "beta" into the def of sigma compared to KP paper; staple - passed to this routine has "beta" already multiplied in + Note: absorb "beta" into the def of sigma compared to KP paper; staple + passed to this routine has "beta" already multiplied in - Action linear in links h and of form: + Action linear in links h and of form: beta S = beta Sum_p (1 - 1/Nc Re Tr Plaq ) - Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' " + Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' " - beta S = const - beta/Nc Re Tr h Sigma' - = const - Re Tr h Sigma + beta S = const - beta/Nc Re Tr h Sigma' + = const - Re Tr h Sigma - Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex - arbitrary. + Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex + arbitrary. Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j) = h_i Sigma_j 2 delta_ij - Re Tr h Sigma = 2 h_j Re Sigma_j + Re Tr h Sigma = 2 h_j Re Sigma_j - Normalised re Sigma_j = xi u_j + Normalised re Sigma_j = xi u_j - With u_j a unit vector and U can be in SU(2); + With u_j a unit vector and U can be in SU(2); - Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u) + Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u) - 4xi^2 = Det [ Sig - Sig^dag + 1 Tr Sigdag] - u = 1/2xi [ Sig - Sig^dag + 1 Tr Sigdag] + 4xi^2 = Det [ Sig - Sig^dag + 1 Tr Sigdag] + u = 1/2xi [ Sig - Sig^dag + 1 Tr Sigdag] - xi = sqrt(Det)/2; + xi = sqrt(Det)/2; - Write a= u h in SU(2); a has pauli decomp a_j; + Write a= u h in SU(2); a has pauli decomp a_j; - Note: Product b' xi is unvariant because scaling Sigma leaves - normalised vector "u" fixed; Can rescale Sigma so b' = 1. + Note: Product b' xi is unvariant because scaling Sigma leaves + normalised vector "u" fixed; Can rescale Sigma so b' = 1. 
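// Editor's aside on the decomposition just quoted: writing h = h0*1 + i h_k sigma_k
// with real h_i and h0^2 + |h|^2 = 1 is what puts h in SU(2), since the
// determinant is exactly that sum of squares. A quick numeric check with
// explicit 2x2 entries (plain C++, not the lattice-wide code of this patch):

#include <complex>
#include <iostream>

int main() {
  typedef std::complex<double> C;
  const C I(0.0, 1.0);
  // h = h0*1 + i(h1*sigma1 + h2*sigma2 + h3*sigma3), with h0^2+h1^2+h2^2+h3^2 = 1
  double h0 = 0.5, h1 = 0.5, h2 = 0.5, h3 = 0.5;
  C m00 = h0 + I * h3, m01 = I * h1 + h2;
  C m10 = I * h1 - h2, m11 = h0 - I * h3;
  // determinant = h0^2 + h1^2 + h2^2 + h3^2 = 1, so h is in SU(2)
  std::cout << "det h = " << m00 * m11 - m01 * m10 << std::endl; // (1,0)
  return 0;
}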
*/ //////////////////////////////////////////////////////// @@ -386,7 +385,7 @@ class SU { xi = 0.5 * sqrt(udet); // 4xi^2 = Det [ Sig - Sig^dag + 1 Tr Sigdag] u = 0.5 * u * - pow(xi, -1.0); // u = 1/2xi [ Sig - Sig^dag + 1 Tr Sigdag] + pow(xi, -1.0); // u = 1/2xi [ Sig - Sig^dag + 1 Tr Sigdag] // Debug test for sanity uinv = adj(u); @@ -394,36 +393,36 @@ class SU { assert(norm2(b) < 1.0e-4); /* - Measure: Haar measure dh has d^4a delta(1-|a^2|) - In polars: - da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2) - = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) + - r) ) - = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) ) + Measure: Haar measure dh has d^4a delta(1-|a^2|) + In polars: + da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2) + = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) + + r) ) + = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) ) - Action factor Q(h) dh = e^-S[h] dh = e^{ xi Tr uh} dh // beta enters - through xi - = e^{2 xi (h.u)} dh - = e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi - h2u2}.e^{2 xi h3u3} dh + Action factor Q(h) dh = e^-S[h] dh = e^{ xi Tr uh} dh // beta enters + through xi + = e^{2 xi (h.u)} dh + = e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi + h2u2}.e^{2 xi h3u3} dh - Therefore for each site, take xi for that site - i) generate |a0|<1 with dist - (1-a0^2)^0.5 e^{2 xi a0 } da0 + Therefore for each site, take xi for that site + i) generate |a0|<1 with dist + (1-a0^2)^0.5 e^{2 xi a0 } da0 - Take alpha = 2 xi = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc - factor in Chroma ] - A. Generate two uniformly distributed pseudo-random numbers R and R', R'', - R''' in the unit interval; - B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha; - C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ; - D. Set A = XC; - E. Let d = X'+A; - F. If R'''^2 :> 1 - 0.5 d, go back to A; - G. Set a0 = 1 - d; + Take alpha = 2 xi = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc + factor in Chroma ] + A. Generate two uniformly distributed pseudo-random numbers R and R', R'', + R''' in the unit interval; + B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha; + C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ; + D. Set A = XC; + E. Let d = X'+A; + F. If R'''^2 :> 1 - 0.5 d, go back to A; + G. Set a0 = 1 - d; - Note that in step D setting B ~ X - A and using B in place of A in step E will - generate a second independent a 0 value. + Note that in step D setting B ~ X - A and using B in place of A in step E will + generate a second independent a 0 value. 
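// Editor's aside: steps A-G above translate almost line-for-line into a
// rejection sampler for a0. A single-site scalar sketch (a hypothetical
// helper, not the lattice-wide masked version this patch works with):

#include <cmath>
#include <iostream>
#include <random>

// One Kennedy-Pendleton a0 draw for coupling alpha = 2*xi (steps A-G above).
double kp_a0(double alpha, std::mt19937 &rng) {
  const double pi = std::acos(-1.0);
  std::uniform_real_distribution<double> u(0.0, 1.0);
  while (true) {                                      // A: draw R, R', R'', R'''
    double R = u(rng), Rp = u(rng), Rpp = u(rng), Rppp = u(rng);
    double X  = -std::log(R) / alpha;                 // B
    double Xp = -std::log(Rp) / alpha;                // B
    double c  = std::cos(2.0 * pi * Rpp);             // C: C = cos^2(2 pi R'')
    double d  = Xp + X * c * c;                       // D,E: d = X' + X*C
    if (Rppp * Rppp <= 1.0 - 0.5 * d) return 1.0 - d; // F: reject / G: accept
  }
}

int main() {
  std::mt19937 rng(42);
  for (int i = 0; i < 3; ++i) std::cout << kp_a0(2.0, rng) << std::endl;
  return 0;
}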
*/ ///////////////////////////////////////////////////////// @@ -518,7 +517,7 @@ class SU { a[3] = a123mag * cos_theta; ua = toComplex(a[0]) * ident + toComplex(a[1]) * pauli1 + - toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3; + toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3; b = 1.0; b = where(wheremask, uinv * ua, b); @@ -616,7 +615,7 @@ class SU { typedef Lattice LatticeComplexType; typedef typename GridTypeMapper< - typename LatticeMatrixType::vector_object>::scalar_object MatrixType; + typename LatticeMatrixType::vector_object>::scalar_object MatrixType; LatticeComplexType ca(grid); LatticeMatrixType lie(grid); @@ -675,11 +674,11 @@ class SU { out += la; } } -/* - add GaugeTrans -*/ + /* + add GaugeTrans + */ -template + template static void GaugeTransform( GaugeField &Umu, GaugeMat &g){ GridBase *grid = Umu._grid; conformable(grid,g._grid); @@ -694,7 +693,7 @@ template } } template - static void GaugeTransform( std::vector &U, GaugeMat &g){ + static void GaugeTransform( std::vector &U, GaugeMat &g){ GridBase *grid = g._grid; GaugeMat ag(grid); ag = adj(g); for(int mu=0;mu SU5; typedef SU FundamentalMatrices; -} -} +NAMESPACE_END(Grid); #endif From 6a62a9c6a524c5d92d6241ebc95ed205534fabcb Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 21:59:48 +0000 Subject: [PATCH 057/754] Namespace --- lib/qcd/utils/Metric.h | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/lib/qcd/utils/Metric.h b/lib/qcd/utils/Metric.h index 60a9bfc5..c4a0ce1a 100644 --- a/lib/qcd/utils/Metric.h +++ b/lib/qcd/utils/Metric.h @@ -25,13 +25,12 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ -//-------------------------------------------------------------------- + /* END LEGAL */ + //-------------------------------------------------------------------- #ifndef METRIC_H #define METRIC_H -namespace Grid{ -namespace QCD{ +NAMESPACE_BEGIN(Grid); template class Metric{ @@ -168,20 +167,20 @@ public: void AuxiliaryFieldsDerivative(MomentaField& der){ der = zero; if (1){ - // Auxiliary fields - MomentaField der_temp(der._grid); - MomentaField X(der._grid); - X=zero; - //M.M(AuxMom, X); // X = M Aux - // Two derivative terms - // the Mderiv need separation of left and right terms - M.MDeriv(AuxMom, der); + // Auxiliary fields + MomentaField der_temp(der._grid); + MomentaField X(der._grid); + X=zero; + //M.M(AuxMom, X); // X = M Aux + // Two derivative terms + // the Mderiv need separation of left and right terms + M.MDeriv(AuxMom, der); - // this one should not be necessary (identical to the previous one) - //M.MDeriv(X, AuxMom, der_temp); der += der_temp; + // this one should not be necessary (identical to the previous one) + //M.MDeriv(X, AuxMom, der_temp); der += der_temp; - der = -1.0*Implementation::projectForce(der); + der = -1.0*Implementation::projectForce(der); } } @@ -212,15 +211,6 @@ public: }; +NAMESPACE_END(Grid); - - - - - - -} -} - - -#endif //METRIC_H \ No newline at end of file +#endif //METRIC_H From fdcbe0a0d117378018d2ff5aafc265ab4acaf5ef Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:00:29 +0000 Subject: [PATCH 058/754] Namespace --- lib/qcd/utils/LinalgUtils.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/qcd/utils/LinalgUtils.h 
b/lib/qcd/utils/LinalgUtils.h index 5eaf1c2a..b96cb63c 100644 --- a/lib/qcd/utils/LinalgUtils.h +++ b/lib/qcd/utils/LinalgUtils.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,13 +25,13 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_QCD_LINALG_UTILS_H #define GRID_QCD_LINALG_UTILS_H -namespace Grid{ -namespace QCD{ +NAMESPACE_BEGIN(Grid); + //////////////////////////////////////////////////////////////////////// //This file brings additional linear combination assistance that is helpful //to QCD, such as chiral projectors and spin matrices applied to one of the inputs. @@ -174,5 +174,5 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x) } } -}} +NAMESPACE_END(Grid); #endif From 9aa34dc8030f2399792641d4e695de852e572d2f Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:01:17 +0000 Subject: [PATCH 059/754] Namespace --- lib/qcd/utils/GaugeFix.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h index c4ea31aa..041a7b6f 100644 --- a/lib/qcd/utils/GaugeFix.h +++ b/lib/qcd/utils/GaugeFix.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -22,18 +22,18 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
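Of the LinalgUtils helpers, the G5R5 named in the hunk header above combines a gamma5 multiply with a reflection of the fifth dimension, z(s) = g5 * x(Ls-1-s); that is my reading of the routine, shown here as a scalar stand-in with the diagonal chiral-basis g5 (not Grid's lattice code):

#include <array>
#include <iostream>
#include <vector>

int main() {
  const int Ls = 4;                 // fifth-dimension extent
  std::vector<std::array<double, 4> > x(Ls), z(Ls);
  for (int s = 0; s < Ls; ++s)
    for (int a = 0; a < 4; ++a) x[s][a] = 10.0 * s + a;

  // z(s) = g5 * x(Ls-1-s), with g5 = diag(1,1,-1,-1) in the chiral basis
  for (int s = 0; s < Ls; ++s)
    for (int a = 0; a < 4; ++a)
      z[s][a] = (a < 2 ? 1.0 : -1.0) * x[Ls - 1 - s][a];

  std::cout << z[0][0] << " " << z[0][3] << std::endl; // 30 -33
  return 0;
}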
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ //#include #ifndef GRID_QCD_GAUGE_FIX_H #define GRID_QCD_GAUGE_FIX_H -namespace Grid { -namespace QCD { + +NAMESPACE_BEGIN(Grid); template class FourierAcceleratedGaugeFixer : public Gimpl { - public: +public: INHERIT_GIMPL_TYPES(Gimpl); typedef typename Gimpl::GaugeLinkField GaugeMat; @@ -60,7 +60,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl { Real trG; std::vector U(Nd,grid); - GaugeMat dmuAmu(grid); + GaugeMat dmuAmu(grid); for(int i=0;i(Umu,mu); @@ -154,8 +154,8 @@ class FourierAcceleratedGaugeFixer : public Gimpl { Fp = psqMax*one/psq; /* - static int once; - if ( once == 0 ) { + static int once; + if ( once == 0 ) { std::cout << " Fp " << Fp < Date: Sun, 14 Jan 2018 22:02:09 +0000 Subject: [PATCH 060/754] Namespace --- lib/qcd/utils/CovariantLaplacian.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/qcd/utils/CovariantLaplacian.h b/lib/qcd/utils/CovariantLaplacian.h index 0c99b03e..6d026013 100644 --- a/lib/qcd/utils/CovariantLaplacian.h +++ b/lib/qcd/utils/CovariantLaplacian.h @@ -25,13 +25,12 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #ifndef COVARIANT_LAPLACIAN_H #define COVARIANT_LAPLACIAN_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); struct LaplacianParams : Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams, @@ -80,19 +79,19 @@ class LaplacianAdjointField: public Metric { MultiShiftFunction PowerHalf; MultiShiftFunction PowerInvHalf; - public: +public: INHERIT_GIMPL_TYPES(Impl); LaplacianAdjointField(GridBase* grid, OperatorFunction& S, LaplacianParams& p, const RealD k = 1.0) - : U(Nd, grid), Solver(S), param(p), kappa(k){ - AlgRemez remez(param.lo,param.hi,param.precision); - std::cout< { - private: +private: RealD kappa; std::vector U; }; -} -} +NAMESPACE_END(Grid); #endif From 0e080a7abce823bc52bc6db3740fc8a7643c00ae Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:03:14 +0000 Subject: [PATCH 061/754] Namespace --- lib/qcd/utils/CovariantCshift.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/qcd/utils/CovariantCshift.h b/lib/qcd/utils/CovariantCshift.h index 2f7561f8..35aa1ca5 100644 --- a/lib/qcd/utils/CovariantCshift.h +++ b/lib/qcd/utils/CovariantCshift.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,13 +24,13 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
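For context on the covariant shifts in the file below: PeriodicBC::CovShiftForward(Link, mu, field) forms the gauge-covariant neighbour U_\mu(x) \psi(x+\hat\mu), implemented as Link*Cshift(field,mu,1), so that the product transforms like a field at x under gauge transformations. A one-dimensional, U(1)-valued toy of the same operation (periodic wrap-around; all names here are illustrative):

#include <complex>
#include <vector>

using Cplx = std::complex<double>;

// out(x) = U(x) * field(x+1): toy analogue of Link * Cshift(field, mu, +1)
std::vector<Cplx> covShiftForwardToy(const std::vector<Cplx> &U,
                                     const std::vector<Cplx> &field) {
  const int L = static_cast<int>(field.size());
  std::vector<Cplx> out(L);
  for (int x = 0; x < L; ++x)
    out[x] = U[x] * field[(x + 1) % L];
  return out;
}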
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef QCD_UTILS_COVARIANT_CSHIFT_H #define QCD_UTILS_COVARIANT_CSHIFT_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); + //////////////////////////////////////////////////////////////////////// // Low performance implementation of CovariantCshift API //////////////////////////////////////////////////////////////////////// @@ -39,8 +39,8 @@ namespace QCD { namespace PeriodicBC { template Lattice CovShiftForward(const Lattice &Link, - int mu, - const Lattice &field) + int mu, + const Lattice &field) { return Link*Cshift(field,mu,1);// moves towards negative mu } @@ -84,8 +84,8 @@ namespace ConjugateBC { // -- // -------> template Lattice CovShiftForward(const Lattice &Link, - int mu, - const Lattice &field) + int mu, + const Lattice &field) { GridBase * grid = Link._grid; @@ -122,9 +122,8 @@ namespace ConjugateBC { return Cshift(tmp,mu,-1);// moves towards positive mu } - } -}} +NAMESPACE_END(Grid); #endif From 4491d87766110f73825aee1ea6d313602f6a9fe5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:04:21 +0000 Subject: [PATCH 062/754] Namespace --- lib/qcd/spin/TwoSpinor.h | 1739 +++++++++++++++++++------------------- 1 file changed, 870 insertions(+), 869 deletions(-) diff --git a/lib/qcd/spin/TwoSpinor.h b/lib/qcd/spin/TwoSpinor.h index 76cdbcf8..3d2143e0 100644 --- a/lib/qcd/spin/TwoSpinor.h +++ b/lib/qcd/spin/TwoSpinor.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -24,1067 +24,1068 @@ Author: Peter Boyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_QCD_TWOSPIN_H #define GRID_QCD_TWOSPIN_H -namespace Grid{ -namespace QCD { - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Normalisation alert; the g5 project is 1/2(1+-G5) - // the xyzt projects are (1+-Gxyzt) - // - // * xyzt project - // - // This is because this is how the Wilson operator is normally written as - // (m+4r) - \frac{1}{2} D_{hop} - // and / or - // 1 - \frac{1}{2 m+8r} D_{hop} = 1 - kappa D_{hop} - // - // Note that the free, critical hopping parameter kappa is then 1/8 th for r=1. - // - // However, the xyzt 2spin "projectors" are not really projectors because they do not - // square to 1, however the ChiralProjector is a true projector. - // - // For this reason there is NO provision in Grid of a four spinor result from the - // xyzt projectors. They are intended to be used only in combination with "reconstruct" in the - // wilson dirac operator and closely related actions. - // - // I also do NOT provide lattice wide operators of these, since the dirac operator is best implemented - // via Stencils and single site variants will be needed only for the cache friendly high perf dirac operator. 
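For concreteness, the algebra behind the normalisation statement in the comment here (a restatement, not a new convention): with bare mass m and Wilson parameter r,

  D_W = (m+4r) - \frac{1}{2} D_{hop}
      = (m+4r)\left(1 - \kappa D_{hop}\right),  with  \kappa \equiv \frac{1}{2m+8r},

so the free massless point m=0 with r=1 gives the critical hopping parameter \kappa_c = \frac{1}{8}, as quoted.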
- // - // * chiral project - // - // Both four spinor and two spinor result variants are provided. - // - // The four spinor project will be recursively provided to Lattice wide routines, and likely used in - // the domain wall and mobius implementations. - // - ////////////////////////////////////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); - /* Gx - * 0 0 0 i [0]+-i[3] - * 0 0 i 0 [1]+-i[2] - * 0 -i 0 0 - * -i 0 0 0 - */ - // To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) ) - template > = 0> strong_inline void spProjXp (iVector &hspin,const iVector &fspin) - { - hspin(0)=fspin(0)+timesI(fspin(3)); - hspin(1)=fspin(1)+timesI(fspin(2)); - } - template > = 0> strong_inline void spProjXm (iVector &hspin,const iVector &fspin) - { - hspin(0)=fspin(0)-timesI(fspin(3)); - hspin(1)=fspin(1)-timesI(fspin(2)); - } +////////////////////////////////////////////////////////////////////////////////////////////////////// +// Normalisation alert; the g5 project is 1/2(1+-G5) +// the xyzt projects are (1+-Gxyzt) +// +// * xyzt project +// +// This is because this is how the Wilson operator is normally written as +// (m+4r) - \frac{1}{2} D_{hop} +// and / or +// 1 - \frac{1}{2 m+8r} D_{hop} = 1 - kappa D_{hop} +// +// Note that the free, critical hopping parameter kappa is then 1/8 th for r=1. +// +// However, the xyzt 2spin "projectors" are not really projectors because they do not +// square to 1, however the ChiralProjector is a true projector. +// +// For this reason there is NO provision in Grid of a four spinor result from the +// xyzt projectors. They are intended to be used only in combination with "reconstruct" in the +// wilson dirac operator and closely related actions. +// +// I also do NOT provide lattice wide operators of these, since the dirac operator is best implemented +// via Stencils and single site variants will be needed only for the cache friendly high perf dirac operator. +// +// * chiral project +// +// Both four spinor and two spinor result variants are provided. +// +// The four spinor project will be recursively provided to Lattice wide routines, and likely used in +// the domain wall and mobius implementations. 
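A worked instance of the compression these "projectors" implement, using the Gx case defined just below: from the matrix in the comment,

  (1 + G_x)\psi : h_0 = \psi_0 + i\psi_3,  h_1 = \psi_1 + i\psi_2,

and the lower two components are then completely determined, \psi'_2 = -i h_1 and \psi'_3 = -i h_0 (the "== -i (1)" / "== -i (0)" annotations in the reconstruction comments further on). Only h_0 and h_1 need to be stored or communicated; spProjXp computes them and spReconXp rebuilds the redundant half, which is why no four-spinor result is provided for the xyzt cases.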
+// +////////////////////////////////////////////////////////////////////////////////////////////////////// - // 0 0 0 -1 [0] -+ [3] - // 0 0 1 0 [1] +- [2] - // 0 1 0 0 - // -1 0 0 0 - template > = 0> strong_inline void spProjYp (iVector &hspin,const iVector &fspin) - { - hspin(0)=fspin(0)-fspin(3); - hspin(1)=fspin(1)+fspin(2); - } - template > = 0> strong_inline void spProjYm (iVector &hspin,const iVector &fspin) - { - hspin(0)=fspin(0)+fspin(3); - hspin(1)=fspin(1)-fspin(2); - } - /*Gz - * 0 0 i 0 [0]+-i[2] - * 0 0 0 -i [1]-+i[3] - * -i 0 0 0 - * 0 i 0 0 - */ - template > = 0> strong_inline void spProjZp (iVector &hspin,const iVector &fspin) - { - hspin(0)=fspin(0)+timesI(fspin(2)); - hspin(1)=fspin(1)-timesI(fspin(3)); - } - template > = 0> strong_inline void spProjZm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(0)-timesI(fspin(2)); - hspin(1)=fspin(1)+timesI(fspin(3)); - } - /*Gt - * 0 0 1 0 [0]+-[2] - * 0 0 0 1 [1]+-[3] - * 1 0 0 0 - * 0 1 0 0 - */ - template > = 0> strong_inline void spProjTp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(0)+fspin(2); - hspin(1)=fspin(1)+fspin(3); - } - template > = 0> strong_inline void spProjTm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(0)-fspin(2); - hspin(1)=fspin(1)-fspin(3); - } - /*G5 - * 1 0 0 0 - * 0 1 0 0 - * 0 0 -1 0 - * 0 0 0 -1 - */ +/* Gx + * 0 0 0 i [0]+-i[3] + * 0 0 i 0 [1]+-i[2] + * 0 -i 0 0 + * -i 0 0 0 + */ - template > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(0); - hspin(1)=fspin(1); - } +// To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) ) +template > = 0> strong_inline void spProjXp (iVector &hspin,const iVector &fspin) +{ + hspin(0)=fspin(0)+timesI(fspin(3)); + hspin(1)=fspin(1)+timesI(fspin(2)); +} +template > = 0> strong_inline void spProjXm (iVector &hspin,const iVector &fspin) +{ + hspin(0)=fspin(0)-timesI(fspin(3)); + hspin(1)=fspin(1)-timesI(fspin(2)); +} - template > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(2); - hspin(1)=fspin(3); - } +// 0 0 0 -1 [0] -+ [3] +// 0 0 1 0 [1] +- [2] +// 0 1 0 0 +// -1 0 0 0 +template > = 0> strong_inline void spProjYp (iVector &hspin,const iVector &fspin) +{ + hspin(0)=fspin(0)-fspin(3); + hspin(1)=fspin(1)+fspin(2); +} +template > = 0> strong_inline void spProjYm (iVector &hspin,const iVector &fspin) +{ + hspin(0)=fspin(0)+fspin(3); + hspin(1)=fspin(1)-fspin(2); +} +/*Gz + * 0 0 i 0 [0]+-i[2] + * 0 0 0 -i [1]-+i[3] + * -i 0 0 0 + * 0 i 0 0 + */ +template > = 0> strong_inline void spProjZp (iVector &hspin,const iVector &fspin) +{ + hspin(0)=fspin(0)+timesI(fspin(2)); + hspin(1)=fspin(1)-timesI(fspin(3)); +} +template > = 0> strong_inline void spProjZm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + hspin(0)=fspin(0)-timesI(fspin(2)); + hspin(1)=fspin(1)+timesI(fspin(3)); +} +/*Gt + * 0 0 1 0 [0]+-[2] + * 0 0 0 1 [1]+-[3] + * 1 0 0 0 + * 0 1 0 0 + */ +template > = 0> strong_inline void spProjTp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type 
*SFINAE; + hspin(0)=fspin(0)+fspin(2); + hspin(1)=fspin(1)+fspin(3); +} +template > = 0> strong_inline void spProjTm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + hspin(0)=fspin(0)-fspin(2); + hspin(1)=fspin(1)-fspin(3); +} +/*G5 + * 1 0 0 0 + * 0 1 0 0 + * 0 0 -1 0 + * 0 0 0 -1 + */ + +template > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + hspin(0)=fspin(0); + hspin(1)=fspin(1); +} + +template > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + hspin(0)=fspin(2); + hspin(1)=fspin(3); +} - // template strong_inline void fspProj5p (iVector &rfspin,const iVector &fspin) - template > = 0> strong_inline void spProj5p (iVector &rfspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - rfspin(0)=fspin(0); - rfspin(1)=fspin(1); - rfspin(2)=zero; - rfspin(3)=zero; - } - // template strong_inline void fspProj5m (iVector &rfspin,const iVector &fspin) - template > = 0> strong_inline void spProj5m (iVector &rfspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - rfspin(0)=zero; - rfspin(1)=zero; - rfspin(2)=fspin(2); - rfspin(3)=fspin(3); - } +// template strong_inline void fspProj5p (iVector &rfspin,const iVector &fspin) +template > = 0> strong_inline void spProj5p (iVector &rfspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + rfspin(0)=fspin(0); + rfspin(1)=fspin(1); + rfspin(2)=zero; + rfspin(3)=zero; +} +// template strong_inline void fspProj5m (iVector &rfspin,const iVector &fspin) +template > = 0> strong_inline void spProj5m (iVector &rfspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + rfspin(0)=zero; + rfspin(1)=zero; + rfspin(2)=fspin(2); + rfspin(3)=fspin(3); +} - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Reconstruction routines to move back again to four spin - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /* Gx - * 0 0 0 i [0]+-i[3] - * 0 0 i 0 [1]+-i[2] - * 0 -i 0 0 -i[1]+-[2] == -i ([0]+-i[3]) = -i (1) - * -i 0 0 0 -i[0]+-[3] == -i ([1]+-i[2]) = -i (0) - */ - template > = 0> strong_inline void spReconXp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesMinusI(hspin(1)); - fspin(3)=timesMinusI(hspin(0)); - } - template > = 0> strong_inline void spReconXm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesI(hspin(1)); - fspin(3)=timesI(hspin(0)); - } - template > = 0> strong_inline void accumReconXp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(1)); - fspin(3)-=timesI(hspin(0)); - } - template > = 0> strong_inline void accumReconXm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - 
fspin(2)+=timesI(hspin(1)); - fspin(3)+=timesI(hspin(0)); - } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Reconstruction routines to move back again to four spin +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/* Gx + * 0 0 0 i [0]+-i[3] + * 0 0 i 0 [1]+-i[2] + * 0 -i 0 0 -i[1]+-[2] == -i ([0]+-i[3]) = -i (1) + * -i 0 0 0 -i[0]+-[3] == -i ([1]+-i[2]) = -i (0) + */ +template > = 0> strong_inline void spReconXp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesMinusI(hspin(1)); + fspin(3)=timesMinusI(hspin(0)); +} +template > = 0> strong_inline void spReconXm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesI(hspin(1)); + fspin(3)=timesI(hspin(0)); +} +template > = 0> strong_inline void accumReconXp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=timesI(hspin(1)); + fspin(3)-=timesI(hspin(0)); +} +template > = 0> strong_inline void accumReconXm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)+=timesI(hspin(1)); + fspin(3)+=timesI(hspin(0)); +} - // 0 0 0 -1 [0] -+ [3] - // 0 0 1 0 [1] +- [2] - // 0 1 0 0 == 1(1) - // -1 0 0 0 ==-1(0) +// 0 0 0 -1 [0] -+ [3] +// 0 0 1 0 [1] +- [2] +// 0 1 0 0 == 1(1) +// -1 0 0 0 ==-1(0) - template > = 0> strong_inline void spReconYp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)= hspin(1); - fspin(3)=-hspin(0);//Unary minus? - } - template > = 0> strong_inline void spReconYm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=-hspin(1); - fspin(3)= hspin(0); - } - template > = 0> strong_inline void accumReconYp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)+=hspin(1); - fspin(3)-=hspin(0); - } - template > = 0> strong_inline void accumReconYm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=hspin(1); - fspin(3)+=hspin(0); - } +template > = 0> strong_inline void spReconYp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)= hspin(1); + fspin(3)=-hspin(0);//Unary minus? 
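To see how a projector and its accumulating reconstructor combine in practice, here is a compilable single-colour (Nc = 1) toy of one forward-X hop of the Wilson hopping term, stringing together the spProjXp and accumReconXp formulas above. Everything here (the names, the Nc = 1 link) is an illustrative assumption, not Grid's kernel, which works through stencils as noted earlier:

#include <array>
#include <complex>

using Cplx = std::complex<double>;
using Spinor     = std::array<Cplx, 4>;  // toy four-spinor (single colour)
using HalfSpinor = std::array<Cplx, 2>;  // compressed two-spinor

// result += reconXp( U * projXp(psiNbr) ): one direction's contribution.
void dslashXpHopToy(Spinor &result, const Spinor &psiNbr, Cplx linkU) {
  const Cplx I(0, 1);
  // spProjXp: h0 = f0 + i f3, h1 = f1 + i f2   (4 -> 2 components)
  HalfSpinor chi  { psiNbr[0] + I * psiNbr[3], psiNbr[1] + I * psiNbr[2] };
  // the colour (link) multiply acts only on the two compressed components
  HalfSpinor Uchi { linkU * chi[0], linkU * chi[1] };
  // accumReconXp: upper adds directly, lower is -i times the swapped upper
  result[0] += Uchi[0];
  result[1] += Uchi[1];
  result[2] += -I * Uchi[1];
  result[3] += -I * Uchi[0];
}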
+} +template > = 0> strong_inline void spReconYm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=-hspin(1); + fspin(3)= hspin(0); +} +template > = 0> strong_inline void accumReconYp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)+=hspin(1); + fspin(3)-=hspin(0); +} +template > = 0> strong_inline void accumReconYm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=hspin(1); + fspin(3)+=hspin(0); +} - /*Gz - * 0 0 i 0 [0]+-i[2] - * 0 0 0 -i [1]-+i[3] - * -i 0 0 0 => -i (0) - * 0 i 0 0 => i (1) - */ - template > = 0> strong_inline void spReconZp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesMinusI(hspin(0)); - fspin(3)=timesI(hspin(1)); - } - template > = 0> strong_inline void spReconZm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)= timesI(hspin(0)); - fspin(3)=timesMinusI(hspin(1)); - } - template > = 0> strong_inline void accumReconZp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(0)); - fspin(3)+=timesI(hspin(1)); - } - template > = 0> strong_inline void accumReconZm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)+=timesI(hspin(0)); - fspin(3)-=timesI(hspin(1)); - } - /*Gt - * 0 0 1 0 [0]+-[2] - * 0 0 0 1 [1]+-[3] - * 1 0 0 0 => (0) - * 0 1 0 0 => (1) - */ - template > = 0> strong_inline void spReconTp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=hspin(0); - fspin(3)=hspin(1); - } - template > = 0> strong_inline void spReconTm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=-hspin(0); - fspin(3)=-hspin(1); - } - template > = 0> strong_inline void accumReconTp (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)+=hspin(0); - fspin(3)+=hspin(1); - } - template > = 0> strong_inline void accumReconTm (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=hspin(0); - fspin(3)-=hspin(1); - } - /*G5 - * 1 0 0 0 - * 0 1 0 0 - * 0 0 -1 0 - * 0 0 0 -1 - */ - template > = 0> strong_inline void spRecon5p (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul - fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though - fspin(2)=zero; - fspin(3)=zero; - } - template > = 0> strong_inline void spRecon5m (iVector &fspin,const iVector &hspin) - { - 
//typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)=zero; - fspin(1)=zero; - fspin(2)=hspin(0)+hspin(0); - fspin(3)=hspin(1)+hspin(1); - } - template > = 0> strong_inline void accumRecon5p (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(0)+=hspin(0)+hspin(0); - fspin(1)+=hspin(1)+hspin(1); - } - template > = 0> strong_inline void accumRecon5m (iVector &fspin,const iVector &hspin) - { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - fspin(2)+=hspin(0)+hspin(0); - fspin(3)+=hspin(1)+hspin(1); - } +/*Gz + * 0 0 i 0 [0]+-i[2] + * 0 0 0 -i [1]-+i[3] + * -i 0 0 0 => -i (0) + * 0 i 0 0 => i (1) + */ +template > = 0> strong_inline void spReconZp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesMinusI(hspin(0)); + fspin(3)=timesI(hspin(1)); +} +template > = 0> strong_inline void spReconZm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)= timesI(hspin(0)); + fspin(3)=timesMinusI(hspin(1)); +} +template > = 0> strong_inline void accumReconZp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=timesI(hspin(0)); + fspin(3)+=timesI(hspin(1)); +} +template > = 0> strong_inline void accumReconZm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)+=timesI(hspin(0)); + fspin(3)-=timesI(hspin(1)); +} +/*Gt + * 0 0 1 0 [0]+-[2] + * 0 0 0 1 [1]+-[3] + * 1 0 0 0 => (0) + * 0 1 0 0 => (1) + */ +template > = 0> strong_inline void spReconTp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=hspin(0); + fspin(3)=hspin(1); +} +template > = 0> strong_inline void spReconTm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=-hspin(0); + fspin(3)=-hspin(1); +} +template > = 0> strong_inline void accumReconTp (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)+=hspin(0); + fspin(3)+=hspin(1); +} +template > = 0> strong_inline void accumReconTm (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=hspin(0); + fspin(3)-=hspin(1); +} +/*G5 + * 1 0 0 0 + * 0 1 0 0 + * 0 0 -1 0 + * 0 0 0 -1 + */ +template > = 0> strong_inline void spRecon5p (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul + fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though + fspin(2)=zero; + fspin(3)=zero; +} +template > = 0> strong_inline void spRecon5m (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)=zero; + fspin(1)=zero; + fspin(2)=hspin(0)+hspin(0); + fspin(3)=hspin(1)+hspin(1); +} 
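The routines above are the base cases; the overloads that follow under "Recursively apply these until we hit the spin index" peel off one tensor layer at a time and recurse into _internal. A compilable toy of that dispatch shape, using plain overload resolution in place of Grid's strong_inline/SFINAE machinery (the types are simplified stand-ins, not Grid's):

#include <complex>

using Cplx = std::complex<double>;

// Simplified stand-ins for Grid's nested tensor types (illustrative only).
template <class vtype>        struct iScalar { vtype _internal;    };
template <class vtype, int N> struct iVector { vtype _internal[N]; };

// Base case: the spin index is outermost (four-spinor over colour scalars).
inline void projXpToy(iVector<iScalar<Cplx>, 2> &h,
                      const iVector<iScalar<Cplx>, 4> &f) {
  const Cplx I(0, 1);
  h._internal[0]._internal = f._internal[0]._internal + I * f._internal[3]._internal;
  h._internal[1]._internal = f._internal[1]._internal + I * f._internal[2]._internal;
}

// Recursive cases: a non-spin wrapper just forwards into its payload.
// Separate rtype/vtype parameters let the spin dimension shrink (4 -> 2)
// inside while the outer shape stays fixed, as in the real overloads.
template <class rtype, class vtype>
void projXpToy(iScalar<rtype> &h, const iScalar<vtype> &f) {
  projXpToy(h._internal, f._internal);
}
template <class rtype, class vtype, int N>
void projXpToy(iVector<rtype, N> &h, const iVector<vtype, N> &f) {
  for (int i = 0; i < N; ++i) projXpToy(h._internal[i], f._internal[i]);
}

With f an iScalar-wrapped four-spinor, the iScalar overload strips the wrapper and the non-template base case then does the spin arithmetic; at the spin level the 4-versus-2 mismatch keeps the iVector template from matching, so the recursion terminates exactly where the spinor index sits.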
+template > = 0> strong_inline void accumRecon5p (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(0)+=hspin(0)+hspin(0); + fspin(1)+=hspin(1)+hspin(1); +} +template > = 0> strong_inline void accumRecon5m (iVector &fspin,const iVector &hspin) +{ + //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; + fspin(2)+=hspin(0)+hspin(0); + fspin(3)+=hspin(1)+hspin(1); +} - ////////////////////////////////////////////////////////////////////////////////////////////// - // Recursively apply these until we hit the spin index - ////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////// +// Recursively apply these until we hit the spin index +////////////////////////////////////////////////////////////////////////////////////////////// - ////////// - // Xp - ////////// - template > = 0> strong_inline void spProjXp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i > = 0> strong_inline void spProjXp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i strong_inline void spProjXp (iScalar &hspin,const iScalar &fspin) - { - spProjXp(hspin._internal,fspin._internal); - } - template strong_inline void spProjXp (iMatrix &hspin,const iMatrix &fspin) - { - for(int i=0;i strong_inline void spProjXp (iScalar &hspin,const iScalar &fspin) +{ + spProjXp(hspin._internal,fspin._internal); +} +template strong_inline void spProjXp (iMatrix &hspin,const iMatrix &fspin) +{ + for(int i=0;i strong_inline void spReconXp (iScalar &hspin,const iScalar &fspin) - { - spReconXp(hspin._internal,fspin._internal); +template strong_inline void spReconXp (iScalar &hspin,const iScalar &fspin) +{ + spReconXp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconXp (iVector &hspin,const iVector &fspin) +{ + for(int i=0;i > = 0> strong_inline void spReconXp (iVector &hspin,const iVector &fspin) - { - for(int i=0;i strong_inline void spReconXp (iMatrix &hspin,const iMatrix &fspin) - { - for(int i=0;i strong_inline void spReconXp (iMatrix &hspin,const iMatrix &fspin) +{ + for(int i=0;i strong_inline void accumReconXp (iScalar &hspin,const iScalar &fspin) - { - accumReconXp(hspin._internal,fspin._internal); +template strong_inline void accumReconXp (iScalar &hspin,const iScalar &fspin) +{ + accumReconXp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconXp (iVector &hspin,const iVector &fspin) +{ + for(int i=0;i > = 0> strong_inline void accumReconXp (iVector &hspin,const iVector &fspin) - { - for(int i=0;i strong_inline void accumReconXp (iMatrix &hspin,const iMatrix &fspin) - { - for(int i=0;i strong_inline void accumReconXp (iMatrix &hspin,const iMatrix &fspin) +{ + for(int i=0;i strong_inline void spProjXm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjXm(hspin._internal,fspin._internal); +//////// +// Xm +//////// +template strong_inline void spProjXm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjXm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjXm (iVector &hspin,const iVector &fspin) +{ + //typename 
std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjXm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconXm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconXm(hspin._internal,fspin._internal); +template strong_inline void spReconXm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconXm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconXm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconXm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconXm(hspin._internal,fspin._internal); +template strong_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconXm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconXm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconXm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjYp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjYp(hspin._internal,fspin._internal); +//////// +// Yp +//////// +template strong_inline void spProjYp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjYp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjYp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjYp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) - { - //typename 
std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconYp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconYp(hspin._internal,fspin._internal); +template strong_inline void spReconYp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconYp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconYp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconYp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconYp(hspin._internal,fspin._internal); +template strong_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconYp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconYp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconYp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjYm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjYm(hspin._internal,fspin._internal); +//////// +// Ym +//////// +template strong_inline void spProjYm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjYm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjYm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjYm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconYm (iScalar &hspin,const iScalar &fspin) - { - //typename 
std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconYm(hspin._internal,fspin._internal); +template strong_inline void spReconYm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconYm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconYm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconYm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; - for(int i=0;i strong_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconYm(hspin._internal,fspin._internal); +template strong_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconYm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconYm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconYm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjZp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjZp(hspin._internal,fspin._internal); +//////// +// Zp +//////// +template strong_inline void spProjZp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjZp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjZp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjZp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconZp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconZp(hspin._internal,fspin._internal); +template strong_inline void spReconZp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconZp(hspin._internal,fspin._internal); +} +template 
> = 0> strong_inline void spReconZp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconZp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconZp(hspin._internal,fspin._internal); +template strong_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconZp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconZp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconZp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjZm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjZm(hspin._internal,fspin._internal); +//////// +// Zm +//////// +template strong_inline void spProjZm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjZm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjZm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjZm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconZm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconZm(hspin._internal,fspin._internal); +template strong_inline void spReconZm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconZm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconZm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconZm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int 
i=0;i strong_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconZm(hspin._internal,fspin._internal); +template strong_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconZm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconZm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconZm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjTp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjTp(hspin._internal,fspin._internal); +//////// +// Tp +//////// +template strong_inline void spProjTp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjTp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjTp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjTp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconTp (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconTp(hspin._internal,fspin._internal); +template strong_inline void spReconTp (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconTp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconTp (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconTp (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i 
strong_inline void accumReconTp (iScalar &hspin, iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconTp(hspin._internal,fspin._internal); +template strong_inline void accumReconTp (iScalar &hspin, iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconTp(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconTp (iVector &hspin, const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconTp (iVector &hspin, const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProjTm (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProjTm(hspin._internal,fspin._internal); +//////// +// Tm +//////// +template strong_inline void spProjTm (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProjTm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProjTm (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProjTm (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spReconTm (iScalar &hspin, const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spReconTm(hspin._internal,fspin._internal); +template strong_inline void spReconTm (iScalar &hspin, const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spReconTm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spReconTm (iVector &hspin, const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spReconTm (iVector &hspin, const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumReconTm(hspin._internal,fspin._internal); +template strong_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) +{ + //typename 
std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumReconTm(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumReconTm (iVector &hspin, const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumReconTm (iVector &hspin, const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProj5p (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProj5p(hspin._internal,fspin._internal); +//////// +// 5p +//////// +template strong_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProj5p(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spRecon5p(hspin._internal,fspin._internal); +template strong_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spRecon5p(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spRecon5p (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spRecon5p (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumRecon5p(hspin._internal,fspin._internal); +template strong_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumRecon5p(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumRecon5p (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void 
accumRecon5p (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) - template strong_inline void spProj5p (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProj5p(hspin._internal,fspin._internal); +// four spinor projectors for chiral proj +// template strong_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) +template strong_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProj5p(hspin._internal,fspin._internal); +} +// template strong_inline void fspProj5p (iVector &hspin,iVector &fspin) +template > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i strong_inline void fspProj5p (iVector &hspin,iVector &fspin) - template > = 0> strong_inline void spProj5p (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) - template strong_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) +template strong_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void spProj5m (iScalar &hspin,const iScalar &fspin) - { - spProj5m(hspin._internal,fspin._internal); +template strong_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +{ + spProj5m(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) +{ + for(int i=0;i > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) - { - for(int i=0;i strong_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) - { - for(int i=0;i strong_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +{ + for(int i=0;i strong_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spRecon5m(hspin._internal,fspin._internal); +template strong_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spRecon5m(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void spRecon5m (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void spRecon5m (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void spRecon5m 
(iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - accumRecon5m(hspin._internal,fspin._internal); +template strong_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + accumRecon5m(hspin._internal,fspin._internal); +} +template > = 0> strong_inline void accumRecon5m (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i > = 0> strong_inline void accumRecon5m (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i strong_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) - template strong_inline void spProj5m (iScalar &hspin,const iScalar &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; - spProj5m(hspin._internal,fspin._internal); +// four spinor projectors for chiral proj +// template strong_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) +template strong_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; + spProj5m(hspin._internal,fspin._internal); +} +// template strong_inline void fspProj5m (iVector &hspin,iVector &fspin) +template > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; + for(int i=0;i strong_inline void fspProj5m (iVector &hspin,iVector &fspin) - template > = 0> strong_inline void spProj5m (iVector &hspin,const iVector &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; - for(int i=0;i strong_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) - template strong_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) - { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i strong_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) +template strong_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +{ + //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; + for(int i=0;i Date: Sun, 14 Jan 2018 22:06:01 +0000 Subject: [PATCH 063/754] Namespace --- lib/qcd/spin/Gamma.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/qcd/spin/Gamma.h b/lib/qcd/spin/Gamma.h index bc32eab9..df04de74 100644 --- a/lib/qcd/spin/Gamma.h +++ b/lib/qcd/spin/Gamma.h @@ -5,8 +5,7 @@ #include -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); class Gamma { public: @@ -1343,6 +1342,6 @@ inline auto operator*(const iMatrix &arg, const Gamma &G) return ret; } -}} +NAMESPACE_END(Grid); #endif // GRID_QCD_GAMMA_H From 1f49f781bfb66c30d26337978719bf42d9f4f1a1 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:07:27 +0000 Subject: [PATCH 064/754] Namespace --- lib/qcd/spin/Gamma.cc | 2253 ++++++++++++++++++++--------------------- 1 file 
changed, 1126 insertions(+), 1127 deletions(-) diff --git a/lib/qcd/spin/Gamma.cc b/lib/qcd/spin/Gamma.cc index 5d9b7719..1056f330 100644 --- a/lib/qcd/spin/Gamma.cc +++ b/lib/qcd/spin/Gamma.cc @@ -2,1139 +2,1138 @@ #include -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); const std::array Gamma::gmu = {{ - Gamma(Gamma::Algebra::GammaX), - Gamma(Gamma::Algebra::GammaY), - Gamma(Gamma::Algebra::GammaZ), - Gamma(Gamma::Algebra::GammaT)}}; + Gamma(Gamma::Algebra::GammaX), + Gamma(Gamma::Algebra::GammaY), + Gamma(Gamma::Algebra::GammaZ), + Gamma(Gamma::Algebra::GammaT)}}; const std::array Gamma::name = {{ - "-Gamma5 ", - "Gamma5 ", - "-GammaT ", - "GammaT ", - "-GammaTGamma5", - "GammaTGamma5 ", - "-GammaX ", - "GammaX ", - "-GammaXGamma5", - "GammaXGamma5 ", - "-GammaY ", - "GammaY ", - "-GammaYGamma5", - "GammaYGamma5 ", - "-GammaZ ", - "GammaZ ", - "-GammaZGamma5", - "GammaZGamma5 ", - "-Identity ", - "Identity ", - "-SigmaXT ", - "SigmaXT ", - "-SigmaXY ", - "SigmaXY ", - "-SigmaXZ ", - "SigmaXZ ", - "-SigmaYT ", - "SigmaYT ", - "-SigmaYZ ", - "SigmaYZ ", - "-SigmaZT ", - "SigmaZT "}}; + "-Gamma5 ", + "Gamma5 ", + "-GammaT ", + "GammaT ", + "-GammaTGamma5", + "GammaTGamma5 ", + "-GammaX ", + "GammaX ", + "-GammaXGamma5", + "GammaXGamma5 ", + "-GammaY ", + "GammaY ", + "-GammaYGamma5", + "GammaYGamma5 ", + "-GammaZ ", + "GammaZ ", + "-GammaZGamma5", + "GammaZGamma5 ", + "-Identity ", + "Identity ", + "-SigmaXT ", + "SigmaXT ", + "-SigmaXY ", + "SigmaXY ", + "-SigmaXZ ", + "SigmaXZ ", + "-SigmaYT ", + "SigmaYT ", + "-SigmaYZ ", + "SigmaYZ ", + "-SigmaZT ", + "SigmaZT "}}; const std::array Gamma::adj = {{ - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT}}; + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + 
Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT}}; const std::array, Gamma::nGamma> Gamma::mul = {{ - {{Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY}} - , - {{Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY}} - , - {{Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ}} - , - {{Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - 
Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ}} - , - {{Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5}} - , - {{Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5}} - , - {{Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5}} - , - {{Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusSigmaXT, - 
Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5}} - , - {{Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY}} - , - {{Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY}} - , - {{Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaX, - 
Gamma::Algebra::GammaX, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5}} - , - {{Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5}} - , - {{Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX}} - , - {{Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX}} - , - {{Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - 
Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT}} - , - {{Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT}} - , - {{Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5}} - , - {{Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - 
Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5}} - , - {{Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT}} - , - {{Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT}} - , - {{Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ}} - , - {{Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - 
Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ}} - , - {{Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5}} - , - {{Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5}} - , - {{Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT}} - , - {{Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - 
Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT}} - , - {{Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ}} - , - {{Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ}} - , - {{Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::MinusSigmaXZ, - 
Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT}} - , - {{Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT}} - , - {{Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaX, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::Gamma5, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusIdentity, - Gamma::Algebra::Identity}} - , - {{Gamma::Algebra::SigmaXY, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::MinusGammaZ, - Gamma::Algebra::GammaZ, - Gamma::Algebra::MinusGammaZGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::MinusGammaYGamma5, - Gamma::Algebra::GammaY, - Gamma::Algebra::MinusGammaY, - Gamma::Algebra::MinusGammaXGamma5, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::MinusGammaX, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaT, - Gamma::Algebra::MinusGammaT, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::MinusGammaTGamma5, - Gamma::Algebra::MinusSigmaZT, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::MinusGamma5, - Gamma::Algebra::Gamma5, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::Identity, - Gamma::Algebra::MinusIdentity}} -}}; + {{Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + 
Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY}} + , + {{Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY}} + , + {{Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ}} + , + {{Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + 
Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ}} + , + {{Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5}} + , + {{Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5}} + , + {{Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5}} + , + {{Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + 
Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5}} + , + {{Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY}} + , + {{Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY}} + , + {{Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5}} + , + {{Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + 
Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5}} + , + {{Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX}} + , + {{Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX}} + , + {{Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaX, 
+ Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT}} + , + {{Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT}} + , + {{Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5}} + , + {{Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5}} + , + {{Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + 
Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT}} + , + {{Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT}} + , + {{Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ}} + , + {{Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::SigmaXZ, + 
Gamma::Algebra::MinusSigmaXZ}} + , + {{Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5}} + , + {{Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5}} + , + {{Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT}} + , + {{Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + 
Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT}} + , + {{Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ}} + , + {{Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ}} + , + {{Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT}} + , + {{Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + 
Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT}} + , + {{Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaX, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::Gamma5, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusIdentity, + Gamma::Algebra::Identity}} + , + {{Gamma::Algebra::SigmaXY, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::MinusGammaZ, + Gamma::Algebra::GammaZ, + Gamma::Algebra::MinusGammaZGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::MinusGammaYGamma5, + Gamma::Algebra::GammaY, + Gamma::Algebra::MinusGammaY, + Gamma::Algebra::MinusGammaXGamma5, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::MinusGammaX, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaT, + Gamma::Algebra::MinusGammaT, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::MinusGammaTGamma5, + Gamma::Algebra::MinusSigmaZT, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::MinusGamma5, + Gamma::Algebra::Gamma5, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::Identity, + Gamma::Algebra::MinusIdentity}} + }}; -}} +NAMESPACE_END(Grid); From f4c06ed8c01550d22e760bebf5e180da614af8b9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:08:25 +0000 Subject: [PATCH 065/754] Namespace --- lib/qcd/smearing/WilsonFlow.h | 236 +++++++++++++++++----------------- 1 file changed, 117 insertions(+), 119 deletions(-) diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 4f5c0d43..abf84e3d 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -25,59 +25,58 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ 
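// [Annotation, not part of the original commit: the header below integrates
//  the Wilson-action gradient flow. evolve_step() is the third-order
//  Runge-Kutta scheme of Luscher (arXiv:1006.4518, appendix C),
//
//    W0 = V(t),  W1 = exp(1/4 Z0) W0,  W2 = exp(8/9 Z1 - 17/36 Z0) W1,
//    V(t+eps) = exp(3/4 Z2 - 8/9 Z1 + 17/36 Z0) W2,  with Z_i = eps Z(W_i),
//
//  and evolve_step_adaptive() is the step-size-controlled variant credited
//  to Ramos in the code comments. A minimal usage sketch, assuming the usual
//  Grid set-up (a GridCartesian *UGrid, a thermalised LatticeGaugeField U)
//  and the PeriodicGimplR implementation policy; illustrative only:
//
//    WilsonFlow<PeriodicGimplR> WF(200, 0.01, 10); // Nstep, epsilon, interval
//    LatticeGaugeField Uflow(UGrid);
//    WF.smear(Uflow, U); // flow to t = Nstep*epsilon, logging E(t) per step
// ]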
#ifndef WILSONFLOW_H #define WILSONFLOW_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); template class WilsonFlow: public Smear{ - unsigned int Nstep; - unsigned int measure_interval; - mutable RealD epsilon, taus; + unsigned int Nstep; + unsigned int measure_interval; + mutable RealD epsilon, taus; - mutable WilsonGaugeAction SG; + mutable WilsonGaugeAction SG; - void evolve_step(typename Gimpl::GaugeField&) const; - void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD); - RealD tau(unsigned int t)const {return epsilon*(t+1.0); } + void evolve_step(typename Gimpl::GaugeField&) const; + void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD); + RealD tau(unsigned int t)const {return epsilon*(t+1.0); } - public: - INHERIT_GIMPL_TYPES(Gimpl) +public: + INHERIT_GIMPL_TYPES(Gimpl) - explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1): - Nstep(Nstep), - epsilon(epsilon), - measure_interval(interval), - SG(WilsonGaugeAction(3.0)) { - // WilsonGaugeAction with beta 3.0 - assert(epsilon > 0.0); - LogMessage(); - } + explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1): + Nstep(Nstep), + epsilon(epsilon), + measure_interval(interval), + SG(WilsonGaugeAction(3.0)) { + // WilsonGaugeAction with beta 3.0 + assert(epsilon > 0.0); + LogMessage(); + } - void LogMessage() { - std::cout << GridLogMessage - << "[WilsonFlow] Nstep : " << Nstep << std::endl; - std::cout << GridLogMessage - << "[WilsonFlow] epsilon : " << epsilon << std::endl; - std::cout << GridLogMessage - << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl; - } + void LogMessage() { + std::cout << GridLogMessage + << "[WilsonFlow] Nstep : " << Nstep << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] epsilon : " << epsilon << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl; + } - virtual void smear(GaugeField&, const GaugeField&) const; + virtual void smear(GaugeField&, const GaugeField&) const; - virtual void derivative(GaugeField&, const GaugeField&, const GaugeField&) const { - assert(0); - // undefined for WilsonFlow - } + virtual void derivative(GaugeField&, const GaugeField&, const GaugeField&) const { + assert(0); + // undefined for WilsonFlow + } - void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau); - RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const; - RealD energyDensityPlaquette(const GaugeField& U) const; + void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau); + RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const; + RealD energyDensityPlaquette(const GaugeField& U) const; }; @@ -86,74 +85,74 @@ class WilsonFlow: public Smear{ //////////////////////////////////////////////////////////////////////////////// template void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U) const{ - GaugeField Z(U._grid); - GaugeField tmp(U._grid); - SG.deriv(U, Z); - Z *= 0.25; // Z0 = 1/4 * F(U) - Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 + GaugeField Z(U._grid); + GaugeField tmp(U._grid); + SG.deriv(U, Z); + Z *= 0.25; // Z0 = 1/4 * F(U) + Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 - Z *= -17.0/8.0; - SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 - Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 - Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 + Z *= -17.0/8.0; + SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 + Z *= 8.0/9.0; // Z = 
-17/36*Z0 +8/9*Z1 + Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 - Z *= -4.0/3.0; - SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 - Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 - Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 + Z *= -4.0/3.0; + SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 + Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 + Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 } template void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) { - if (maxTau - taus < epsilon){ - epsilon = maxTau-taus; - } - //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; - GaugeField Z(U._grid); - GaugeField Zprime(U._grid); - GaugeField tmp(U._grid), Uprime(U._grid); - Uprime = U; - SG.deriv(U, Z); - Zprime = -Z; - Z *= 0.25; // Z0 = 1/4 * F(U) - Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 + if (maxTau - taus < epsilon){ + epsilon = maxTau-taus; + } + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + GaugeField Z(U._grid); + GaugeField Zprime(U._grid); + GaugeField tmp(U._grid), Uprime(U._grid); + Uprime = U; + SG.deriv(U, Z); + Zprime = -Z; + Z *= 0.25; // Z0 = 1/4 * F(U) + Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 - Z *= -17.0/8.0; - SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 - Zprime += 2.0*tmp; - Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 - Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 + Z *= -17.0/8.0; + SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 + Zprime += 2.0*tmp; + Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 + Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 - Z *= -4.0/3.0; - SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 - Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 - Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 + Z *= -4.0/3.0; + SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 + Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 + Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 - // Ramos - Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0 - // Compute distance as norm^2 of the difference - GaugeField diffU = U - Uprime; - RealD diff = norm2(diffU); - // adjust integration step + // Ramos + Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0 + // Compute distance as norm^2 of the difference + GaugeField diffU = U - Uprime; + RealD diff = norm2(diffU); + // adjust integration step - taus += epsilon; - //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; + taus += epsilon; + //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; - epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); - //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; + epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); + //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } template RealD WilsonFlow::energyDensityPlaquette(unsigned int step, const GaugeField& U) const { - RealD td = tau(step); - return 2.0 * td * td * SG.S(U)/U._grid->gSites(); + RealD td = tau(step); + return 2.0 * td * td * SG.S(U)/U._grid->gSites(); } template RealD WilsonFlow::energyDensityPlaquette(const GaugeField& U) const { - return 2.0 * taus * taus * SG.S(U)/U._grid->gSites(); + return 2.0 * taus * taus * SG.S(U)/U._grid->gSites(); } @@ -163,51 
+162,50 @@ RealD WilsonFlow::energyDensityPlaquette(const GaugeField& U) const { template void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { - out = in; - for (unsigned int step = 1; step <= Nstep; step++) { - auto start = std::chrono::high_resolution_clock::now(); - evolve_step(out); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = end - start; - #ifdef WF_TIMING - std::cout << "Time to evolve " << diff.count() << " s\n"; - #endif - std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " - << step << " " - << energyDensityPlaquette(step,out) << std::endl; - if( step % measure_interval == 0){ - std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " - << step << " " - << WilsonLoops::TopologicalCharge(out) << std::endl; - } + out = in; + for (unsigned int step = 1; step <= Nstep; step++) { + auto start = std::chrono::high_resolution_clock::now(); + evolve_step(out); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; +#ifdef WF_TIMING + std::cout << "Time to evolve " << diff.count() << " s\n"; +#endif + std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " + << step << " " + << energyDensityPlaquette(step,out) << std::endl; + if( step % measure_interval == 0){ + std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " + << step << " " + << WilsonLoops::TopologicalCharge(out) << std::endl; } + } } template void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){ - out = in; - taus = epsilon; - unsigned int step = 0; - do{ - step++; - //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; - evolve_step_adaptive(out, maxTau); - std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " - << step << " " - << energyDensityPlaquette(out) << std::endl; - if( step % measure_interval == 0){ - std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " - << step << " " - << WilsonLoops::TopologicalCharge(out) << std::endl; - } - } while (taus < maxTau); + out = in; + taus = epsilon; + unsigned int step = 0; + do{ + step++; + //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; + evolve_step_adaptive(out, maxTau); + std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " + << step << " " + << energyDensityPlaquette(out) << std::endl; + if( step % measure_interval == 0){ + std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " + << step << " " + << WilsonLoops::TopologicalCharge(out) << std::endl; + } + } while (taus < maxTau); } -} // namespace QCD -} // namespace Grid +NAMESPACE_END(Grid); #endif // WILSONFLOW_H From 1591d391b9f0511fd75fe469b1f8d62f73c96199 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:09:42 +0000 Subject: [PATCH 066/754] Namespace --- lib/qcd/smearing/StoutSmearing.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/qcd/smearing/StoutSmearing.h b/lib/qcd/smearing/StoutSmearing.h index bfc37d0d..9a036e98 100644 --- a/lib/qcd/smearing/StoutSmearing.h +++ b/lib/qcd/smearing/StoutSmearing.h @@ -5,16 +5,15 @@ #ifndef STOUT_SMEAR_ #define STOUT_SMEAR_ -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); /*! @brief Stout smearing of link variable. 
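  [Annotation, not part of the original commit: the smeared link produced
   below is U'_mu(x) = exp(iQ_mu(x)) U_mu(x), where iQ_mu = Ta(C_mu U_mu^+)
   is the traceless anti-hermitian projection of the rho-weighted staple sum
   C_mu supplied by the base smearing (SmearBase). For SU(3) the exponential
   is evaluated in the closed Cayley-Hamilton form
     exp(iQ) = f0(u,w) + f1(u,w) iQ + f2(u,w) (iQ)^2,
   cf. Morningstar and Peardon, hep-lat/0311018; the u, w and h0/h1/h2
   quantities computed in the helpers below build these coefficients.]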
*/ template class Smear_Stout : public Smear { - private: +private: const Smear* SmearBase; - public: +public: INHERIT_GIMPL_TYPES(Gimpl) Smear_Stout(Smear* base) : SmearBase(base) { @@ -41,8 +40,8 @@ class Smear_Stout : public Smear { tmp = peekLorentz(C, mu); Umu = peekLorentz(U, mu); iq_mu = Ta( - tmp * - adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper + tmp * + adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper exponentiate_iQ(tmp, iq_mu); pokeLorentz(u_smr, tmp * Umu, mu); // u_smr = exp(iQ_mu)*U_mu } @@ -105,7 +104,7 @@ class Smear_Stout : public Smear { c0max = 2.0 * pow(tmp, 1.5); theta = acos(c0 / c0max) * - one_over_three; // divide by three here, now leave as it is + one_over_three; // divide by three here, now leave as it is u = sqrt(tmp) * cos(theta); w = sqrt(c1) * sin(theta); } @@ -130,7 +129,7 @@ class Smear_Stout : public Smear { e2iu = cos(2.0 * u) + timesI(sin(2.0 * u)); h0 = e2iu * (u2 - w2) + - emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0)); + emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0)); h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0); h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0); @@ -154,7 +153,7 @@ class Smear_Stout : public Smear { return cos(w) / (w * w) - sin(w) / (w * w * w); } }; -} -} + +NAMESPACE_END(Grid); #endif From e2c39945b3c03d89879c23408f9ef820543b537c Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:11:03 +0000 Subject: [PATCH 067/754] Namespace --- lib/qcd/smearing/GaugeConfiguration.h | 48 +++++++++++++-------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/lib/qcd/smearing/GaugeConfiguration.h b/lib/qcd/smearing/GaugeConfiguration.h index fc045ba2..6173fe64 100644 --- a/lib/qcd/smearing/GaugeConfiguration.h +++ b/lib/qcd/smearing/GaugeConfiguration.h @@ -6,12 +6,10 @@ #ifndef GAUGE_CONFIG_ #define GAUGE_CONFIG_ -namespace Grid { +NAMESPACE_BEGIN(Grid); -namespace QCD { - - //trivial class for no smearing - template< class Impl > +//trivial class for no smearing +template< class Impl > class NoSmearing { public: INHERIT_FIELD_TYPES(Impl); @@ -45,10 +43,10 @@ public: */ template class SmearedConfiguration { - public: +public: INHERIT_GIMPL_TYPES(Gimpl); - private: +private: const unsigned int smearingLevels; Smear_Stout StoutSmearing; std::vector SmearedSet; @@ -130,7 +128,7 @@ class SmearedConfiguration { LatticeComplex r01(grid), r11(grid), r21(grid), r02(grid), r12(grid); LatticeComplex r22(grid), tr1(grid), tr2(grid); LatticeComplex b10(grid), b11(grid), b12(grid), b20(grid), b21(grid), - b22(grid); + b22(grid); LatticeComplex LatticeUnitComplex(grid); LatticeUnitComplex = 1.0; @@ -154,19 +152,19 @@ class SmearedConfiguration { e2iu = cos(2.0 * u) + timesI(sin(2.0 * u)); r01 = (2.0 * u + timesI(2.0 * (u2 - w2))) * e2iu + - emiu * ((16.0 * u * cosw + 2.0 * u * (3.0 * u2 + w2) * xi0) + - timesI(-8.0 * u2 * cosw + 2.0 * (9.0 * u2 + w2) * xi0)); + emiu * ((16.0 * u * cosw + 2.0 * u * (3.0 * u2 + w2) * xi0) + + timesI(-8.0 * u2 * cosw + 2.0 * (9.0 * u2 + w2) * xi0)); r11 = (2.0 * LatticeUnitComplex + timesI(4.0 * u)) * e2iu + - emiu * ((-2.0 * cosw + (3.0 * u2 - w2) * xi0) + - timesI((2.0 * u * cosw + 6.0 * u * xi0))); + emiu * ((-2.0 * cosw + (3.0 * u2 - w2) * xi0) + + timesI((2.0 * u * cosw + 6.0 * u * xi0))); r21 = - 2.0 * timesI(e2iu) + emiu * (-3.0 * u * xi0 + timesI(cosw - 3.0 * xi0)); + 2.0 * timesI(e2iu) + emiu * (-3.0 * u * xi0 + timesI(cosw - 3.0 * xi0)); r02 = -2.0 * e2iu + - emiu * (-8.0 * u2 * xi0 + - 
timesI(2.0 * u * (cosw + xi0 + 3.0 * u2 * xi1))); + emiu * (-8.0 * u2 * xi0 + + timesI(2.0 * u * (cosw + xi0 + 3.0 * u2 * xi1))); r12 = emiu * (2.0 * u * xi0 + timesI(-cosw - xi0 + 3.0 * u2 * xi1)); @@ -200,28 +198,28 @@ class SmearedConfiguration { GaugeLinkField USQ = USigmap * iQ; GaugeLinkField iGamma = tr1 * iQ - timesI(tr2) * iQ2 + - timesI(f1) * USigmap + f2 * QUS + f2 * USQ; + timesI(f1) * USigmap + f2 * QUS + f2 * USQ; iLambda = Ta(iGamma); } //==================================================================== - public: +public: GaugeField* - ThinLinks; /*!< @brief Pointer to the thin - links configuration */ + ThinLinks; /*!< @brief Pointer to the thin + links configuration */ /*! @brief Standard constructor */ SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, Smear_Stout& Stout) - : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) { + : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) { for (unsigned int i = 0; i < smearingLevels; ++i) SmearedSet.push_back(*(new GaugeField(UGrid))); } /*! For just thin links */ SmearedConfiguration() - : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} + : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} @@ -237,7 +235,7 @@ class SmearedConfiguration { for (int mu = 0; mu < Nd; mu++) { // to get just SigmaTilde tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * - peekLorentz(force, mu); + peekLorentz(force, mu); pokeLorentz(force, tmp_mu, mu); } @@ -261,7 +259,7 @@ class SmearedConfiguration { if (smeared) { if (smearingLevels) { RealD impl_plaq = - WilsonLoops::avgPlaquette(SmearedSet[smearingLevels - 1]); + WilsonLoops::avgPlaquette(SmearedSet[smearingLevels - 1]); std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq << std::endl; return get_SmearedU(); @@ -280,7 +278,7 @@ class SmearedConfiguration { } } }; -} -} + +NAMESPACE_END(Grid); #endif From 34a788331f55c572ffc884f34081485f03f4c4b9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:13:02 +0000 Subject: [PATCH 068/754] Namespace --- lib/qcd/smearing/APEsmearing.h | 198 ++++++++++++++++----------------- 1 file changed, 97 insertions(+), 101 deletions(-) diff --git a/lib/qcd/smearing/APEsmearing.h b/lib/qcd/smearing/APEsmearing.h index d3fe94f6..36e14423 100644 --- a/lib/qcd/smearing/APEsmearing.h +++ b/lib/qcd/smearing/APEsmearing.h @@ -25,134 +25,130 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ -/*! - @brief Declaration of Smear_APE class for APE smearing -*/ + /* END LEGAL */ + /*! + @brief Declaration of Smear_APE class for APE smearing + */ #ifndef APE_SMEAR_ #define APE_SMEAR_ - namespace Grid { - namespace QCD { +NAMESPACE_BEGIN(Grid); + +/*! @brief APE type smearing of link variables. 
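  [Annotation, not part of the original commit: smear() returns,
   schematically, the weighted sum of the staples perpendicular to each link,
     C_mu(x) = sum_{nu != mu} rho_{mu nu} ( U_nu(x) U_mu(x+nu) U_nu^+(x+mu)
               + U_nu^+(x-nu) U_mu(x-nu) U_nu(x-nu+mu) ),
   with no projection back onto the group; reunitarisation is left to the
   caller, e.g. the stout class, which exponentiates Ta(C_mu U_mu^+).]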
*/ +template +class Smear_APE: public Smear{ +private: + const std::vector rho;/*!< Array of weights */ + + //This member must be private - we do not want to control from outside + std::vector set_rho(const double common_rho) const { + std::vector res; + + for(int mn=0; mn - class Smear_APE: public Smear{ - private: - const std::vector rho;/*!< Array of weights */ + // Constructors and destructors + Smear_APE(const std::vector& rho_):rho(rho_){} // check vector size + Smear_APE(double rho_val):rho(set_rho(rho_val)){} + Smear_APE():rho(set_rho(1.0)){} + ~Smear_APE(){} -//This member must be private - we do not want to control from outside - std::vector set_rho(const double common_rho) const { - std::vector res; + /////////////////////////////////////////////////////////////////////////////// + void smear(GaugeField& u_smr, const GaugeField& U)const{ + GridBase *grid = U._grid; + GaugeLinkField Cup(grid), tmp_stpl(grid); + WilsonLoops WL; + u_smr = zero; - for(int mn=0; mn& rho_):rho(rho_){} // check vector size - Smear_APE(double rho_val):rho(set_rho(rho_val)){} - Smear_APE():rho(set_rho(1.0)){} - ~Smear_APE(){} - - /////////////////////////////////////////////////////////////////////////////// - void smear(GaugeField& u_smr, const GaugeField& U)const{ - GridBase *grid = U._grid; - GaugeLinkField Cup(grid), tmp_stpl(grid); - WilsonLoops WL; - u_smr = zero; - - for(int mu=0; mu WL; - GaugeLinkField staple(grid), u_tmp(grid); - GaugeLinkField iLambda_mu(grid), iLambda_nu(grid); - GaugeLinkField U_mu(grid), U_nu(grid); - GaugeLinkField sh_field(grid), temp_Sigma(grid); - Real rho_munu, rho_numu; + WilsonLoops WL; + GaugeLinkField staple(grid), u_tmp(grid); + GaugeLinkField iLambda_mu(grid), iLambda_nu(grid); + GaugeLinkField U_mu(grid), U_nu(grid); + GaugeLinkField sh_field(grid), temp_Sigma(grid); + Real rho_munu, rho_numu; - for(int mu = 0; mu < Nd; ++mu){ - U_mu = peekLorentz( U, mu); - iLambda_mu = peekLorentz(iLambda, mu); + for(int mu = 0; mu < Nd; ++mu){ + U_mu = peekLorentz( U, mu); + iLambda_mu = peekLorentz(iLambda, mu); - for(int nu = 0; nu < Nd; ++nu){ - if(nu==mu) continue; - U_nu = peekLorentz( U, nu); - iLambda_nu = peekLorentz(iLambda, nu); + for(int nu = 0; nu < Nd; ++nu){ + if(nu==mu) continue; + U_nu = peekLorentz( U, nu); + iLambda_nu = peekLorentz(iLambda, nu); - rho_munu = rho[mu + Nd * nu]; - rho_numu = rho[nu + Nd * mu]; + rho_munu = rho[mu + Nd * nu]; + rho_numu = rho[nu + Nd * mu]; - WL.StapleUpper(staple, U, mu, nu); + WL.StapleUpper(staple, U, mu, nu); - temp_Sigma = -rho_numu*staple*iLambda_nu; //ok - //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x) - Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + temp_Sigma = -rho_numu*staple*iLambda_nu; //ok + //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); - sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity? + sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity? 
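// [Annotation, not part of the original commit: the temp_Sigma terms
//  accumulated in this loop are the pieces of the chain-rule derivative of
//  the staple sum with respect to the thin links, contracted with iLambda;
//  this is the analytic smearing force that feeds the stout force recursion,
//  cf. Morningstar and Peardon, hep-lat/0311018.]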
- temp_Sigma = rho_numu*sh_field*staple; //ok - //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x) - Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + temp_Sigma = rho_numu*sh_field*staple; //ok + //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); - sh_field = Cshift(iLambda_mu, nu, 1); + sh_field = Cshift(iLambda_mu, nu, 1); - temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok - //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x) - Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok + //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); - staple = zero; - sh_field = Cshift(U_nu, mu, 1); + staple = zero; + sh_field = Cshift(U_nu, mu, 1); - temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; - temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; + temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; + temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; - u_tmp = adj(U_nu)*iLambda_nu; - sh_field = Cshift(u_tmp, mu, 1); - temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; - sh_field = Cshift(temp_Sigma, nu, -1); - Gimpl::AddLink(SigmaTerm, sh_field, mu); + u_tmp = adj(U_nu)*iLambda_nu; + sh_field = Cshift(u_tmp, mu, 1); + temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; + sh_field = Cshift(temp_Sigma, nu, -1); + Gimpl::AddLink(SigmaTerm, sh_field, mu); - } - } - } - }; + } + } + } +}; +NAMESPACE_END(Grid); - - }// namespace QCD -}//namespace Grid #endif From 81dcd0e6ea7197f023bb01d80197b29166a1c8dd Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:13:46 +0000 Subject: [PATCH 069/754] Namespace --- lib/qcd/representations/two_index.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/qcd/representations/two_index.h b/lib/qcd/representations/two_index.h index 082a52a5..60209076 100644 --- a/lib/qcd/representations/two_index.h +++ b/lib/qcd/representations/two_index.h @@ -1,13 +1,12 @@ /* * Policy classes for the HMC * Authors: Guido Cossu, David Preti -*/ + */ #ifndef SUN2INDEX_H_H #define SUN2INDEX_H_H -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); /* * This is a helper class for the HMC * Should contain only the data for the smearing * and the operations to maintain the smeared fields cohere nt * with the gauge field update */ template class TwoIndexRep { - public: +public: // typedef to be used by the Representations class in HMC to get the // types for the higher representation fields typedef typename SU_TwoIndex::LatticeTwoIndexMatrix LatticeMatrix; @@ -79,21 +78,22 @@ class TwoIndexRep { return out; } - private: +private: void projectOnAlgebra(typename SU::LatticeAlgebraVector &h_out, const LatticeMatrix &in, Real scale = 1.0) const { SU_TwoIndex::projectOnAlgebra(h_out, in, scale); } void FundamentalLieAlgebraMatrix( - typename SU::LatticeAlgebraVector &h, - typename SU::LatticeMatrix &out, Real scale = 1.0) const { + typename SU::LatticeAlgebraVector &h, + typename SU::LatticeMatrix &out, Real scale = 1.0) const { SU::FundamentalLieAlgebraMatrix(h, out, scale); } }; typedef TwoIndexRep TwoIndexSymmetricRepresentation; typedef TwoIndexRep TwoIndexAntiSymmetricRepresentation; -} -} + +NAMESPACE_END(Grid); + #endif From 9f2f294a27f644e987459810efb31c4ddda81fec Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 14 Jan 2018 22:14:58 +0000 Subject: [PATCH 070/754] Namespace --- lib/qcd/representations/hmc_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git 
a/lib/qcd/representations/hmc_types.h b/lib/qcd/representations/hmc_types.h index 3fee377e..5e4ab020 100644 --- a/lib/qcd/representations/hmc_types.h +++ b/lib/qcd/representations/hmc_types.h @@ -9,17 +9,15 @@ #include #include -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); // Supported types // enum {Fundamental, Adjoint} repr_type; - // Utility to add support to the HMC for representations other than the // fundamental template class Representations { - public: +public: typedef std::tuple Representation_type; // Size of the tuple, known at compile time @@ -47,11 +45,11 @@ class Representations { template inline typename std::enable_if<(I == tuple_size), void>::type update( - LatticeSourceField& U) {} + LatticeSourceField& U) {} template inline typename std::enable_if<(I < tuple_size), void>::type update( - LatticeSourceField& U) { + LatticeSourceField& U) { std::get(rep).update_representation(U); update(U); } @@ -77,7 +75,7 @@ struct AccessTypes : AccessTypes {}; template