From af9c8d1372c15e542ab4e2ce69a3e5150ba30c06 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 6 Apr 2016 13:50:56 -0400 Subject: [PATCH 01/21] -Checkerboard fixes for Lanczos --- lib/algorithms/iterative/ImplicitlyRestartedLanczos.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 1e89fb00..c53d9318 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -564,6 +564,7 @@ until convergence for(int j=k1-1; j Date: Fri, 15 Apr 2016 13:17:42 -0400 Subject: [PATCH 03/21] -Complete and working implementation of Grid_empty --- lib/simd/Grid_avx.h | 10 +- lib/simd/Grid_empty.h | 283 ++++++++++++++++++++++++++++-------------- lib/simd/Grid_sse4.h | 6 +- 3 files changed, 200 insertions(+), 99 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index b836e757..f33bdf9c 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -410,22 +410,22 @@ namespace Optimization { struct Permute{ static inline __m256 Permute0(__m256 in){ - return _mm256_permute2f128_ps(in,in,0x01); + return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD }; static inline __m256 Permute1(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF }; static inline __m256 Permute2(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG }; static inline __m256 Permute3(__m256 in){ return in; }; static inline __m256d Permute0(__m256d in){ - return _mm256_permute2f128_pd(in,in,0x01); + return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB }; - static inline __m256d Permute1(__m256d in){ + static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC return _mm256_shuffle_pd(in,in,0x5); }; static inline __m256d Permute2(__m256d in){ diff --git a/lib/simd/Grid_empty.h b/lib/simd/Grid_empty.h index 8858624e..5ab75de7 100644 --- a/lib/simd/Grid_empty.h +++ b/lib/simd/Grid_empty.h @@ -55,51 +55,67 @@ namespace Optimization { struct Vsplat{ //Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(float a, float b){ + u128f out; + out.f[0] = a; + out.f[1] = b; + out.f[2] = a; + out.f[3] = b; + return out; } // Real float - inline float operator()(float a){ - return 0; + inline u128f operator()(float a){ + u128f out; + out.f[0] = a; + out.f[1] = a; + out.f[2] = a; + out.f[3] = a; + return out; } //Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(double a, double b){ + u128d out; + out.f[0] = a; + out.f[1] = b; + return out; } //Real double - inline double operator()(double a){ - return 0; + inline u128d operator()(double a){ + u128d out; + out.f[0] = a; + out.f[1] = a; + return out; } //Integer inline int operator()(Integer a){ - return 0; + return a; } }; struct Vstore{ //Float - inline void operator()(float a, float* F){ - + inline void operator()(u128f a, float* F){ + memcpy(F,a.f,4*sizeof(float)); } //Double - inline void operator()(double a, double* D){ - + inline void operator()(u128d a, double* D){ + memcpy(D,a.f,2*sizeof(double)); } //Integer inline void operator()(int a, Integer* I){ - + I[0] = a; } }; struct Vstream{ //Float - inline void operator()(float * a, float b){ - + inline void operator()(float * a, u128f b){ + memcpy(a,b.f,4*sizeof(float)); } //Double - inline void operator()(double * a, double b){ - + inline void operator()(double * a, u128d b){ + memcpy(a,b.f,2*sizeof(double)); } @@ -107,24 +123,40 @@ namespace Optimization { struct Vset{ // Complex float - inline float operator()(Grid::ComplexF *a){ - return 0; + inline u128f operator()(Grid::ComplexF *a){ + u128f out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + out.f[2] = a[1].real(); + out.f[3] = a[1].imag(); + return out; } // Complex double - inline double operator()(Grid::ComplexD *a){ - return 0; + inline u128d operator()(Grid::ComplexD *a){ + u128d out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + return out; } // Real float - inline float operator()(float *a){ - return 0; + inline u128f operator()(float *a){ + u128f out; + out.f[0] = a[0]; + out.f[1] = a[1]; + out.f[2] = a[2]; + out.f[3] = a[3]; + return out; } // Real double - inline double operator()(double *a){ - return 0; + inline u128d operator()(double *a){ + u128d out; + out.f[0] = a[0]; + out.f[1] = a[1]; + return out; } // Integer inline int operator()(Integer *a){ - return 0; + return a[0]; } @@ -146,129 +178,198 @@ namespace Optimization { ///////////////////////////////////////////////////// struct Sum{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + out.f[2] = a.f[2] + b.f[2]; + out.f[3] = a.f[3] + b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a + b; } }; struct Sub{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + out.f[2] = a.f[2] - b.f[2]; + out.f[3] = a.f[3] - b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a-b; } }; struct MultComplex{ // Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3]; + out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2]; + return out; } // Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + return out; } }; struct Mult{ - inline float mac(float a, float b,double c){ - return 0; - } - inline double mac(double a, double b,double c){ - return 0; - } + //CK: Appear unneeded + // inline float mac(float a, float b,double c){ + // return 0; + // } + // inline double mac(double a, double b,double c){ + // return 0; + // } + // Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + out.f[2] = a.f[2]*b.f[2]; + out.f[3] = a.f[3]*b.f[3]; + return out; } // Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + return out; } // Integer inline int operator()(int a, int b){ - return 0; + return a*b; } }; struct Conj{ // Complex single - inline float operator()(float in){ - return 0; + inline u128f operator()(u128f in){ + u128f out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + out.f[2] = in.f[2]; + out.f[3] = -in.f[3]; + return out; } // Complex double - inline double operator()(double in){ - return 0; + inline u128d operator()(u128d in){ + u128d out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + return out; } // do not define for integer input }; struct TimesMinusI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = -in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + return out; } - - }; struct TimesI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = -in.f[3]; + out.f[3] = in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + return out; } }; ////////////////////////////////////////////// // Some Template specialization struct Permute{ - - static inline float Permute0(float in){ + //We just have to mirror the permutes of Grid_sse4.h + static inline u128f Permute0(u128f in){ //AB CD -> CD AB + u128f out; + out.f[0] = in.f[2]; + out.f[1] = in.f[3]; + out.f[2] = in.f[0]; + out.f[3] = in.f[1]; + return out; + }; + static inline u128f Permute1(u128f in){ //AB CD -> BA DC + u128f out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = in.f[2]; + return out; + }; + static inline u128f Permute2(u128f in){ return in; }; - static inline float Permute1(float in){ - return in; - }; - static inline float Permute2(float in){ - return in; - }; - static inline float Permute3(float in){ + static inline u128f Permute3(u128f in){ return in; }; - static inline double Permute0(double in){ + static inline u128d Permute0(u128d in){ //AB -> BA + u128d out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + return out; + }; + static inline u128d Permute1(u128d in){ return in; }; - static inline double Permute1(double in){ + static inline u128d Permute2(u128d in){ return in; }; - static inline double Permute2(double in){ - return in; - }; - static inline double Permute3(double in){ + static inline u128d Permute3(u128d in){ return in; }; @@ -280,26 +381,26 @@ namespace Optimization { //Complex float Reduce template<> - inline Grid::ComplexF Reduce::operator()(float in){ - return 0; + inline Grid::ComplexF Reduce::operator()(u128f in){ //2 complex + return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]); } //Real float Reduce template<> - inline Grid::RealF Reduce::operator()(float in){ - return 0; + inline Grid::RealF Reduce::operator()(u128f in){ //4 floats + return in.f[0] + in.f[1] + in.f[2] + in.f[3]; } //Complex double Reduce template<> - inline Grid::ComplexD Reduce::operator()(double in){ - return 0; + inline Grid::ComplexD Reduce::operator()(u128d in){ //1 complex + return Grid::ComplexD(in.f[0],in.f[1]); } //Real double Reduce template<> - inline Grid::RealD Reduce::operator()(double in){ - return 0; + inline Grid::RealD Reduce::operator()(u128d in){ //2 doubles + return in.f[0] + in.f[1]; } //Integer Reduce @@ -314,8 +415,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef float SIMD_Ftype; // Single precision type - typedef double SIMD_Dtype; // Double precision type + typedef Optimization::u128f SIMD_Ftype; // Single precision type + typedef Optimization::u128d SIMD_Dtype; // Double precision type typedef int SIMD_Itype; // Integer type // prefetch utilities diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b88ad4c9..8f4a9c93 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -267,10 +267,10 @@ namespace Optimization { struct Permute{ static inline __m128 Permute0(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB }; static inline __m128 Permute1(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC }; static inline __m128 Permute2(__m128 in){ return in; @@ -279,7 +279,7 @@ namespace Optimization { return in; }; - static inline __m128d Permute0(__m128d in){ + static inline __m128d Permute0(__m128d in){ //AB -> BA return _mm_shuffle_pd(in,in,0x1); }; static inline __m128d Permute1(__m128d in){ From f473919526154607bbfa357cee4be1b00a9303a3 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 22:23:51 +0100 Subject: [PATCH 04/21] Rotate support --- lib/simd/Grid_avx.h | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index b836e757..f7292add 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -437,6 +437,111 @@ namespace Optimization { }; +#if defined (AVX2) || defined (AVXFMA4) +#define _mm256_alignr_epi32(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*4)%16) +#define _mm256_alignr_epi64(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*8)%16) +#endif + +#if defined (AVX1) + +#define _mm256_alignr_epi32(ret,a,b,n) { \ + __m128 aa, bb; \ + \ + aa = _mm256_extractf128_ps(a,1); \ + bb = _mm256_extractf128_ps(b,1); \ + aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + ret = _mm256_insertf128_ps(ret,aa,1); \ + \ + aa = _mm256_extractf128_ps(a,0); \ + bb = _mm256_extractf128_ps(b,0); \ + aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + ret = _mm256_insertf128_ps(ret,aa,0); \ + } + +#define _mm256_alignr_epi64(ret,a,b,n) { \ + __m128 aa, bb; \ + \ + aa = _mm256_extractf128_pd(a,1); \ + bb = _mm256_extractf128_pd(b,1); \ + aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + ret = _mm256_insertf128_pd(ret,aa,1); \ + \ + aa = _mm256_extractf128_pd(a,0); \ + bb = _mm256_extractf128_pd(b,0); \ + aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + ret = _mm256_insertf128_pd(ret,aa,0); \ + } + +#endif + + inline std::ostream & operator << (std::ostream& stream, const __m256 a) + { + const float *p=(const float *)&a; + stream<< "{"<(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + static inline __m256d rotate(__m256d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } + } + + + template + static inline __m256 tRotate(__m256 in){ + __m256 tmp = Permute::Permute0(in); + __m256 ret; + if ( n > 3 ) { + _mm256_alignr_epi32(ret,in,tmp,n); + } else { + _mm256_alignr_epi32(ret,tmp,in,n); + } + // std::cout << " align epi32 n=" < "<< ret < + static inline __m256d tRotate(__m256d in){ + __m256d tmp = Permute::Permute0(in); + __m256d ret; + if ( n > 1 ) { + _mm256_alignr_epi64(ret,in,tmp,n); + } else { + _mm256_alignr_epi64(ret,tmp,in,n); + } + // std::cout << " align epi64 n=" < "<< ret < From e5657510b0f9e068fb3142dec6b47c4cc5e29818 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 22:24:18 +0100 Subject: [PATCH 05/21] Rotate support for Ls simd-ized --- lib/simd/Grid_avx512.h | 116 +++++++++++++++++++++++++++++++++++ lib/simd/Grid_sse4.h | 26 ++++++++ lib/simd/Grid_vector_types.h | 40 ++++++++++-- 3 files changed, 176 insertions(+), 6 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 5d014137..9e601971 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -308,6 +308,122 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m512 rotate(__m512 in, int n){ + return = _mm512_alignr_epi32(in,in,n); + }; + + static inline __m512d rotate(__m512d in, int n){ + return = _mm512_alignr_epi64(tmp,in,n); + }; + + +#if 0 + // 16 x 32 bit = 512 bits; 0-15 rotates + static inline __m512 rotateR(__m512 in, int n){ + + // 0 : D3210 C3210 B3210 A3210 -> D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A0321 D3210 C3210 B3210 -> A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 + // 2 : B0321 A0321 D3210 C3210 -> B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 + // 3 : C0321 B0321 A0321 D3210 -> C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 + // 4 : D0321 C0321 B0321 A0321 -> D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 + // 5 : A1032 D0321 C0321 B0321 -> A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 + // 6 : B1032 A1032 D0321 C0321 -> B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 + // 7 : C1032 B1032 A1032 D0321 -> C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 + // 8 : D1032 C1032 B1032 A1032 -> D1 C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 + //... + //15 : C3210 B3210 A3210 D2103 -> C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 D3 + + int shuf_l = ( (n+3)/4 ) % 4; // shuf = 0,1,1,1,1,2,2,2,2,3,3,3,3,0,0,0 + int shuf_r = ( (n)/4 ) % 4; // shuf = 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 + + int peri = n%4; + __m512 left,right; + switch(shuf_l){ // In = D3210 C3210 B3210 A3210 + case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 + case 2: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... + case 3: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... + } + + switch(shuf_r){ // In = D3210 C3210 B3210 A3210 + case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 + case 2: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... + case 3: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... + } + return = _mm512_alignr_epi32(left,right,peri*4); + + }; + + // 8 x 64 bit = 512 bits; 0-7 rotates + static inline __m512 RotateZ(__m512 in, int n){ + + // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 + // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 + // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 + // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 + // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 + // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 + // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 + + int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 + int shuf_r = (n/4) % 2; + int peri = n%4; + + __m512 left, right; + switch(shuf_l){ // In = D3210 C3210 B3210 A3210 + case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... + } + switch(shuf_r){ // In = D3210 C3210 B3210 A3210 + case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... + } + return = _mm512_alignr_epi32(tmp,in,peri*4); + }; + + + + // 8 x 64 bit = 512 bits; 0-7 rotates + static inline __m512d RotateR(__m512d in, int n){ + + // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 + // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 + // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 + // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 + // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 + // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 + // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 + int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 + int shuf_r = (n/4) % 2; + int peri = n%4; + + __m512 left, right; + switch(shuf_l){ + case 0: left = in; break; + case 1: left = _mm512_shuffle_pd(in,in,0x55); + } + switch(shuf_r){ + case 0: right = in; break; + case 1: right = _mm512_shuffle_pd(in,in,0x55); + } + return = _mm512_alignr_epi64(tmp,in,peri*2); + + }; + + // 4 x 128 bit = 512 bits; 0-4 rotates + static inline __m512 RotateZ(__m512 in, int n){ + int peri = n%4; + return = _mm512_alignr_epi32(in,in,peri*2); + }; +#endif + + }; + ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b88ad4c9..8abd4581 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -294,6 +294,32 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m128 rotate(__m128 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } + } + static inline __m128d rotate(__m128d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + default: assert(0); + } + } + +#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) + + template static inline __m128 tRotate(__m128 in){ return _mm_alignr_epi32(in,in,n); }; + template static inline __m128d tRotate(__m128d in){ return _mm_alignr_epi64(in,in,n); }; + + }; ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index c247f15d..b485a47a 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -299,16 +299,44 @@ namespace Grid { } friend inline void permute(Grid_simd &y,Grid_simd b,int perm) { - if (perm==3) permute3(y,b); - else if (perm==2) permute2(y,b); - else if (perm==1) permute1(y,b); - else if (perm==0) permute0(y,b); + if ( perm & RotateBit ) { + int dist = perm&0xF; + y=rotate(b,dist); + return; + } + switch(perm){ + case 3: permute3(y,b); break; + case 2: permute2(y,b); break; + case 1: permute1(y,b); break; + case 0: permute0(y,b); break; + default: assert(0); + } } - - };// end of Grid_simd class definition + //////////////////////////////////////////////////////////////////// + // General rotate + //////////////////////////////////////////////////////////////////// + template =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Real by "< =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Complex by "< Date: Tue, 19 Apr 2016 15:13:54 -0700 Subject: [PATCH 06/21] Updated to compile and pass under intel SDE --- lib/simd/Grid_avx512.h | 150 +++++++++++------------------------------ 1 file changed, 41 insertions(+), 109 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 9e601971..b057a61b 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -308,123 +308,55 @@ namespace Optimization { }; + struct Rotate{ - static inline __m512 rotate(__m512 in, int n){ - return = _mm512_alignr_epi32(in,in,n); + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); + } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); }; - static inline __m512d rotate(__m512d in, int n){ - return = _mm512_alignr_epi64(tmp,in,n); + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n); }; - -#if 0 - // 16 x 32 bit = 512 bits; 0-15 rotates - static inline __m512 rotateR(__m512 in, int n){ - - // 0 : D3210 C3210 B3210 A3210 -> D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 - // 1 : A0321 D3210 C3210 B3210 -> A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 - // 2 : B0321 A0321 D3210 C3210 -> B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 - // 3 : C0321 B0321 A0321 D3210 -> C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 - // 4 : D0321 C0321 B0321 A0321 -> D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 - // 5 : A1032 D0321 C0321 B0321 -> A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 - // 6 : B1032 A1032 D0321 C0321 -> B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 - // 7 : C1032 B1032 A1032 D0321 -> C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 - // 8 : D1032 C1032 B1032 A1032 -> D1 C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 - //... - //15 : C3210 B3210 A3210 D2103 -> C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 D3 - - int shuf_l = ( (n+3)/4 ) % 4; // shuf = 0,1,1,1,1,2,2,2,2,3,3,3,3,0,0,0 - int shuf_r = ( (n)/4 ) % 4; // shuf = 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 - - int peri = n%4; - __m512 left,right; - switch(shuf_l){ // In = D3210 C3210 B3210 A3210 - case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 - case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 - case 2: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... - case 3: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... - } - - switch(shuf_r){ // In = D3210 C3210 B3210 A3210 - case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 - case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 - case 2: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... - case 3: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... - } - return = _mm512_alignr_epi32(left,right,peri*4); - - }; - - // 8 x 64 bit = 512 bits; 0-7 rotates - static inline __m512 RotateZ(__m512 in, int n){ - - // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 - // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 - // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 - // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 - // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 - // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 - // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 - // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 - - int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 - int shuf_r = (n/4) % 2; - int peri = n%4; - - __m512 left, right; - switch(shuf_l){ // In = D3210 C3210 B3210 A3210 - case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 - case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... - } - switch(shuf_r){ // In = D3210 C3210 B3210 A3210 - case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 - case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... - } - return = _mm512_alignr_epi32(tmp,in,peri*4); - }; - - - - // 8 x 64 bit = 512 bits; 0-7 rotates - static inline __m512d RotateR(__m512d in, int n){ - - // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 - // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 - // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 - // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 - // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 - // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 - // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 - // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 - int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 - int shuf_r = (n/4) % 2; - int peri = n%4; - - __m512 left, right; - switch(shuf_l){ - case 0: left = in; break; - case 1: left = _mm512_shuffle_pd(in,in,0x55); - } - switch(shuf_r){ - case 0: right = in; break; - case 1: right = _mm512_shuffle_pd(in,in,0x55); - } - return = _mm512_alignr_epi64(tmp,in,peri*2); - - }; - - // 4 x 128 bit = 512 bits; 0-4 rotates - static inline __m512 RotateZ(__m512 in, int n){ - int peri = n%4; - return = _mm512_alignr_epi32(in,in,peri*2); - }; -#endif - }; - ////////////////////////////////////////////// // Some Template specialization From f2ae9682ffa4eaf225a92092ab46b4ca272c175d Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:14:32 -0700 Subject: [PATCH 07/21] Remove some timing hacks --- benchmarks/Benchmark_zmm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc index 74674cbf..e5039b16 100644 --- a/benchmarks/Benchmark_zmm.cc +++ b/benchmarks/Benchmark_zmm.cc @@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) t1=usecond(); mfa = flops*ncall/(t1-t0); std::cout< Date: Tue, 19 Apr 2016 15:15:11 -0700 Subject: [PATCH 08/21] const safety --- lib/lattice/Lattice_peekpoke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lattice/Lattice_peekpoke.h b/lib/lattice/Lattice_peekpoke.h index 5edc223a..9bece943 100644 --- a/lib/lattice/Lattice_peekpoke.h +++ b/lib/lattice/Lattice_peekpoke.h @@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// template - void peekLocalSite(sobj &s,Lattice &l,std::vector &site){ + void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ GridBase *grid=l._grid; From 04072a5e1f72fecc87e220abfc1fbaf60cb57f56 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:15:34 -0700 Subject: [PATCH 09/21] Rotate is a temporary hack. Would like to merge ALL permutes as rotates of length 2, and make any rotate active over any subset of lane bits. This is hard, and requires general permute; current intrinsics mean this is only really possible for specific case by case encodings as presently performed. Intel could produce a general permute.. would help. IBM did it in VMX. --- lib/Simd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Simd.h b/lib/Simd.h index 27a5ec46..de49cca7 100644 --- a/lib/Simd.h +++ b/lib/Simd.h @@ -47,6 +47,8 @@ Author: paboyle #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D) #define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B) +#define RotateBit (0x100) + namespace Grid { typedef uint32_t Integer; From c8a93d6a93b3adead66115d17c6c211bfdbfeccc Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:18:12 -0700 Subject: [PATCH 10/21] Cartesian changes to allow all simd in one direction --- lib/cartesian/Cartesian_base.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index afb4ab92..8272ac71 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -138,6 +138,25 @@ public: } inline int PermuteType(int dimension){ int permute_type=0; + // + // FIXME: + // + // Best way to encode this would be to present a mask + // for which simd dimensions are rotated, and the rotation + // size. If there is only one simd dimension rotated, this is just + // a permute. + // + // Cases: PermuteType == 1,2,4,8 + // Distance should be either 0,1,2.. + // + if ( _simd_layout[dimension] > 2 ) { + for(int d=0;d<_ndimension;d++){ + if ( d != dimension ) assert ( (_simd_layout[d]==1) ); + } + permute_type = RotateBit; // How to specify distance; this is not just direction. + return permute_type; + } + for(int d=_ndimension-1;d>dimension;d--){ if (_simd_layout[d]>1 ) permute_type++; } @@ -147,12 +166,12 @@ public: // Array sizing queries //////////////////////////////////////////////////////////////// - inline int iSites(void) { return _isites; }; - inline int Nsimd(void) { return _isites; };// Synonymous with iSites - inline int oSites(void) { return _osites; }; - inline int lSites(void) { return _isites*_osites; }; - inline int gSites(void) { return _isites*_osites*_Nprocessors; }; - inline int Nd (void) { return _ndimension;}; + inline int iSites(void) const { return _isites; }; + inline int Nsimd(void) const { return _isites; };// Synonymous with iSites + inline int oSites(void) const { return _osites; }; + inline int lSites(void) const { return _isites*_osites; }; + inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; + inline int Nd (void) const { return _ndimension;}; inline const std::vector &FullDimensions(void) { return _fdimensions;}; inline const std::vector &GlobalDimensions(void) { return _gdimensions;}; @@ -165,6 +184,9 @@ public: void GlobalIndexToGlobalCoor(int gidx,std::vector &gcoor){ Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); } + void LocalIndexToLocalCoor(int lidx,std::vector &lcoor){ + Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); + } void GlobalCoorToGlobalIndex(const std::vector & gcoor,int & gidx){ gidx=0; int mult=1; From b27bac466994b48449b0a5842ac801fffd07ebc7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:34:10 -0700 Subject: [PATCH 11/21] Updates for simd in one dir --- lib/Stencil.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index c2f083ff..2b238622 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -321,6 +321,9 @@ PARALLEL_FOR_LOOP int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; @@ -368,7 +371,8 @@ PARALLEL_FOR_LOOP int rd = _grid->_rdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int gd = _grid->_gdimensions[dimension]; - + int ly = _grid->_simd_layout[dimension]; + // Map to always positive shift modulo global full dimension. int shift = (shiftpm+fd)%fd; @@ -398,7 +402,7 @@ PARALLEL_FOR_LOOP int wrap = sshift/rd; int num = sshift%rd; if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; + else permute_slice = (wrap+1)%ly; } CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); @@ -418,7 +422,6 @@ PARALLEL_FOR_LOOP int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; - // assert(simd_layout==1); // Why? assert(comm_dim==1); int shift = (shiftpm + fd) %fd; assert(shift>=0); @@ -529,7 +532,7 @@ PARALLEL_FOR_LOOP _entries[point][lo+o+b]._around_the_world=wrap; } - } + } o +=_grid->_slice_stride[dimension]; } @@ -749,6 +752,7 @@ PARALLEL_FOR_LOOP int comm_dim = _grid->_processors[dimension] >1 ; assert(comm_dim==1); + // This will not work with a rotate dim assert(simd_layout==2); assert(shift>=0); assert(shift2 // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); From 7223753355d4469382b6fecde6c8e4c462c6797e Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:35:15 -0700 Subject: [PATCH 12/21] Rotate in a direction > 2 for simd_layout --- lib/cshift/Cshift_common.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 05c7c18c..b8e1284a 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -324,6 +324,7 @@ template Lattice Cshift_local(Lattice &ret,const Lattice int rd = grid->_rdimensions[dimension]; int ld = grid->_ldimensions[dimension]; int gd = grid->_gdimensions[dimension]; + int ly = grid->_simd_layout[dimension]; // Map to always positive shift modulo global full dimension. shift = (shift+fd)%fd; @@ -332,6 +333,7 @@ template Lattice Cshift_local(Lattice &ret,const Lattice // the permute type int permute_dim =grid->PermuteDim(dimension); int permute_type=grid->PermuteType(dimension); + int permute_type_dist; for(int x=0;x Lattice Cshift_local(Lattice &ret,const Lattice int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sx = (x+sshift)%rd; + // FIXME : This must change where we have a + // Rotate slice. + + // Document how this works ; why didn't I do this when I first wrote it... + // wrap is whether sshift > rd. + // num is sshift mod rd. + // int permute_slice=0; if(permute_dim){ int wrap = sshift/rd; int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; + else permute_slice = (wrap+1)%ly; + + if ( (ly>2) && (permute_slice) ) { + assert(permute_type & RotateBit); + permute_type_dist = permute_type|permute_slice; + } else { + permute_type_dist = permute_type; + } + } - if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type); + if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); else Copy_plane(ret,rhs,dimension,x,sx,cbmask); From 806a83d38b19e4dcd6542a25acde0c4fe129ecd4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:36:19 -0700 Subject: [PATCH 13/21] simd in fifth dim support for dwf --- lib/qcd/action/fermion/FermionOperatorImpl.h | 106 ++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 3ecb87f7..399c780b 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -142,6 +142,10 @@ namespace Grid { mult(&phi(),&U(mu),&chi()); } + template + inline void loadLinkElement(Simd & reg,ref &memory){ + reg = memory; + } inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) { conformable(Uds._grid,GaugeGrid); @@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP }; + + + /////// + // Single flavour four spinors with colour index, 5d redblack + /////// + template + class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { + public: + + typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds >; + template using iImplGaugeField = iVector >, Nd >; + template using iImplGaugeLink = iScalar > >; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef Lattice FermionField; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; + + bool overlapCommsCompute(void) { return false; }; + + template + inline void loadLinkElement(Simd & reg,ref &memory){ + vsplat(reg,memory); + } + inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) + { + SiteGaugeLink UU; + for(int i=0;i(Umu,mu); + U = adj(Cshift(U,mu,-1)); + PokeIndex(Uadj,U,mu); + } + + for(int lidx=0;lidxlSites();lidx++){ + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor); + + peekLocalSite(ScalarUmu,Umu,lcoor); + for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu); + + peekLocalSite(ScalarUmu,Uadj,lcoor); + for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu); + + pokeLocalSite(ScalarUds,Uds,lcoor); + } + + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + assert(0); + } + + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ + assert(0); + } + + }; + + //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// @@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP conformable(Uds._grid,GaugeGrid); conformable(Umu._grid,GaugeGrid); - GaugeLinkField Utmp(GaugeGrid); - GaugeLinkField U(GaugeGrid); + GaugeLinkField Utmp (GaugeGrid); + GaugeLinkField U (GaugeGrid); GaugeLinkField Uconj(GaugeGrid); Lattice > coor(GaugeGrid); @@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double + typedef DomainWallRedBlack5dImpl DomainWallRedBlack5dImplR; // Real.. whichever prec + typedef DomainWallRedBlack5dImpl DomainWallRedBlack5dImplF; // Float + typedef DomainWallRedBlack5dImpl DomainWallRedBlack5dImplD; // Double + typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec typedef GparityWilsonImpl GparityWilsonImplF; // Float typedef GparityWilsonImpl GparityWilsonImplD; // Double From 9b6ab6db16cb72433199a429193eb1da41fd7293 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:38:01 -0700 Subject: [PATCH 14/21] simd in 5th dimension support --- lib/qcd/action/fermion/WilsonFermion.cc | 15 +- lib/qcd/action/fermion/WilsonFermion.h | 3 - lib/qcd/action/fermion/WilsonFermion5D.cc | 410 +++++--------------- lib/qcd/action/fermion/WilsonFermion5D.h | 30 +- lib/qcd/action/fermion/WilsonKernels.cc | 2 + lib/qcd/action/fermion/WilsonKernelsAsm.cc | 2 + lib/qcd/action/fermion/WilsonKernelsHand.cc | 79 +++- 7 files changed, 172 insertions(+), 369 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index d874e0ac..d5fde091 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -288,11 +288,7 @@ PARALLEL_FOR_LOOP void WilsonFermion::DhopInternal(StencilImpl & st,DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - if ( Impl::overlapCommsCompute () ) { - DhopInternalCommsOverlapCompute(st,U,in,out,dag); - } else { - DhopInternalCommsThenCompute(st,U,in,out,dag); - } + DhopInternalCommsThenCompute(st,U,in,out,dag); } template void WilsonFermion::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U, @@ -330,15 +326,6 @@ PARALLEL_FOR_LOOP } }; - - template - void WilsonFermion::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) { - - assert(0); - - }; - FermOpTemplateInstantiate(WilsonFermion); GparityFermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 6bd9fb38..220c3449 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -116,9 +116,6 @@ namespace Grid { void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) ; - void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) ; - // Constructor WilsonFermion(GaugeField &_Umu, diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 9874031d..b8dba2f7 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -68,10 +68,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); - assert(FiveDimRedBlackGrid._ndimension==5); assert(FourDimRedBlackGrid._ndimension==4); - assert(FiveDimRedBlackGrid._checker_dim==1); // Dimension zero of the five-d is the Ls direction @@ -106,11 +104,75 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, dslashtime=0; dslash1time=0; } + +template +WilsonFermion5D::WilsonFermion5D(int simd, GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _M5,const ImplParams &p) : + Kernels(p), + _FiveDimGrid (&FiveDimGrid), + _FiveDimRedBlackGrid(&FiveDimRedBlackGrid), + _FourDimGrid (&FourDimGrid), + _FourDimRedBlackGrid(&FourDimRedBlackGrid), + Stencil (_FiveDimGrid,npoint,Even,directions,displacements), + StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even + StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd + M5(_M5), + Umu(_FourDimGrid), + UmuEven(_FourDimRedBlackGrid), + UmuOdd (_FourDimRedBlackGrid), + Lebesgue(_FourDimGrid), + LebesgueEvenOdd(_FourDimRedBlackGrid) +{ + int nsimd = Simd::Nsimd(); + + // some assertions + assert(FiveDimGrid._ndimension==5); + assert(FiveDimRedBlackGrid._ndimension==5); + assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction + assert(FourDimGrid._ndimension==4); + assert(FourDimRedBlackGrid._ndimension==4); + + // Dimension zero of the five-d is the Ls direction + Ls=FiveDimGrid._fdimensions[0]; + assert(FiveDimGrid._processors[0] ==1); + assert(FiveDimGrid._simd_layout[0] ==nsimd); + + assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); + assert(FiveDimRedBlackGrid._processors[0] ==1); + assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); + + // Other dimensions must match the decomposition of the four-D fields + for(int d=0;d<4;d++){ + assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); + assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); + + assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); + assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); + + assert(FourDimGrid._simd_layout[d]=1); + assert(FourDimRedBlackGrid._simd_layout[d] ==1); + assert(FourDimRedBlackGrid._simd_layout[d] ==1); + assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); + + assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); + assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); + } + + // Allocate the required comms buffer + ImportGauge(_Umu); +} + + template void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) { - GaugeField HUmu(_Umu._grid); - HUmu = _Umu*(-0.5); + GaugeField HUmu(_Umu._grid); + HUmu = _Umu*(-0.5); Impl::DoubleStore(GaugeGrid(),Umu,HUmu); pickCheckerboard(Even,UmuEven,Umu); pickCheckerboard(Odd ,UmuOdd,Umu); @@ -294,11 +356,7 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes Compressor compressor(dag); // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); + int LLs = in._grid->_rdimensions[0]; commtime -=usecond(); auto handle = st.HaloExchangeBegin(in,compressor); @@ -318,97 +376,48 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes if( this->HandOptDslash ) { PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); -#pragma omp parallel for - for(int t=0;toSites();ss++){ + for(int s=0;sHandOptDslash ) { - /* - -#pragma omp parallel for schedule(static) - for(int t=0;toSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;s -void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); - Compressor compressor(dag); - - // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); - - commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - st.HaloExchangeComplete(handle); - commtime +=usecond(); - - jointime -=usecond(); - jointime +=usecond(); - - // Dhop takes the 4d grid from U, and makes a 5d index for fermion - // Not loop ordering and data layout. - // Designed to create - // - per thread reuse in L1 cache for U - // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. - -#pragma omp parallel - { - for(int jjj=0;jjj<100;jjj++){ -#pragma omp barrier - dslashtime -=usecond(); - if ( dag == DaggerYes ) { - if( this->HandOptDslash ) { -#pragma omp for - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); - -#pragma omp for - for(int t=0;tHandOptDslash ) { -#pragma omp for - - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;s -void WilsonFermion5D::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); - Compressor compressor(dag); - - // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); - - commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - st.HaloExchangeComplete(handle); - commtime +=usecond(); - - jointime -=usecond(); - jointime +=usecond(); - - // Dhop takes the 4d grid from U, and makes a 5d index for fermion - // Not loop ordering and data layout. - // Designed to create - // - per thread reuse in L1 cache for U - // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. - -#pragma omp parallel - { - for(int jjj=0;jjj<100;jjj++){ -#pragma omp barrier - dslashtime -=usecond(); - if ( dag == DaggerYes ) { - if( this->HandOptDslash ) { -#pragma omp for - for(int ss=0;ssoSites();ss++){ - int sU=0; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); - -#pragma omp for - for(int t=0;tHandOptDslash ) { -#pragma omp for - - for(int ss=0;ssoSites();ss++){ - int sU=0; - for(int s=0;soSites();ss++){ - int sU=0; - for(int s=0;s -void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - assert(0); -} template void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) @@ -706,6 +470,8 @@ void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag FermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D); +template class WilsonFermion5D; +template class WilsonFermion5D; }} diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 840c1a46..792e4dd0 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -87,6 +87,7 @@ namespace Grid { virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); @@ -121,32 +122,12 @@ namespace Grid { FermionField &out, int dag); - void DhopInternalOMPbench(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - void DhopInternalL1bench(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - void DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalCommsOverlapCompute(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); // Constructors WilsonFermion5D(GaugeField &_Umu, @@ -156,6 +137,15 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); + // Constructors + WilsonFermion5D(int simd, + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + // DoubleStore void ImportGauge(const GaugeField &_Umu); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index b94284f7..387ac0cd 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -529,5 +529,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField #endif FermOpTemplateInstantiate(WilsonKernels); +template class WilsonKernels; +template class WilsonKernels; }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index bdda199f..859d1a20 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -256,5 +256,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField template class WilsonKernels; template class WilsonKernels; template class WilsonKernels; + template class WilsonKernels; + template class WilsonKernels; }} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 74440f16..c4e4d87e 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -54,14 +54,15 @@ Author: paboyle Chi_11 = ref()(1)(1);\ Chi_12 = ref()(1)(2); +// To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ auto & ref(U._odata[sU](A)); \ - U_00 = ref()(0,0);\ - U_10 = ref()(1,0);\ - U_20 = ref()(2,0);\ - U_01 = ref()(0,1);\ - U_11 = ref()(1,1); \ - U_21 = ref()(2,1);\ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ UChi_00 = U_00*Chi_00;\ UChi_10 = U_00*Chi_10;\ UChi_01 = U_10*Chi_00;\ @@ -74,9 +75,9 @@ Author: paboyle UChi_11+= U_11*Chi_11;\ UChi_02+= U_21*Chi_01;\ UChi_12+= U_21*Chi_11;\ - U_00 = ref()(0,2);\ - U_10 = ref()(1,2);\ - U_20 = ref()(2,2);\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ UChi_00+= U_00*Chi_02;\ UChi_10+= U_00*Chi_12;\ UChi_01+= U_10*Chi_02;\ @@ -84,6 +85,7 @@ Author: paboyle UChi_02+= U_20*Chi_02;\ UChi_12+= U_20*Chi_12; + #define PERMUTE_DIR(dir) \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ @@ -809,7 +811,6 @@ int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Doub int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 - //check consistency of return types between these functions and the ones in WilsonKernels.cc return 0; } @@ -843,6 +844,47 @@ int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,D + ////////////// +/* +template<> +int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 + return 0; + +} + +template<> +int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +template<> +int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +template<> +int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +*/ + template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); @@ -870,4 +912,21 @@ template int WilsonKernels::DiracOptHandDhopSiteDag(StencilI std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); + + + +template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); + + }} From ba427abde9013c94c99a02cb49384f3c4c580c90 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:38:39 -0700 Subject: [PATCH 15/21] simd 5d --- lib/qcd/action/gauge/GaugeImpl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/qcd/action/gauge/GaugeImpl.h b/lib/qcd/action/gauge/GaugeImpl.h index 51a4675e..012d21e4 100644 --- a/lib/qcd/action/gauge/GaugeImpl.h +++ b/lib/qcd/action/gauge/GaugeImpl.h @@ -42,7 +42,9 @@ template class WilsonLoops; #define INHERIT_GIMPL_TYPES(GImpl) \ typedef typename GImpl::Simd Simd;\ typedef typename GImpl::GaugeLinkField GaugeLinkField;\ - typedef typename GImpl::GaugeField GaugeField; + typedef typename GImpl::GaugeField GaugeField;\ + typedef typename GImpl::SiteGaugeField SiteGaugeField;\ + typedef typename GImpl::SiteGaugeLink SiteGaugeLink; // From 8fd8bc25e9d2e18fbf7494561d090742fbc0a9c7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:39:00 -0700 Subject: [PATCH 16/21] simd 5th dim with rotation --- lib/Make.inc | 4 +- lib/qcd/utils/SpaceTimeGrid.cc | 45 +++++- lib/qcd/utils/SpaceTimeGrid.h | 5 + lib/tensors/Tensor_extract_merge.h | 12 +- tests/Make.inc | 14 +- tests/Test_cshift_red_black_rotate.cc | 223 ++++++++++++++++++++++++++ tests/Test_cshift_rotate.cc | 125 +++++++++++++++ tests/Test_dwf_rb5d.cc | 138 ++++++++++++++++ tests/Test_simd.cc | 168 ++++++++++++++++++- 9 files changed, 724 insertions(+), 10 deletions(-) create mode 100644 tests/Test_cshift_red_black_rotate.cc create mode 100644 tests/Test_cshift_rotate.cc create mode 100644 tests/Test_dwf_rb5d.cc diff --git a/lib/Make.inc b/lib/Make.inc index 06b9da14..a7639d01 100644 --- a/lib/Make.inc +++ b/lib/Make.inc @@ -1,4 +1,4 @@ -HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512avxAddsub.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h +HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonFermion5DRedBlack.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/NerscCheckpointer.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512avxAddsub.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h -CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc +CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./PerfCount.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc diff --git a/lib/qcd/utils/SpaceTimeGrid.cc b/lib/qcd/utils/SpaceTimeGrid.cc index 0b8c2a94..efa7b8b9 100644 --- a/lib/qcd/utils/SpaceTimeGrid.cc +++ b/lib/qcd/utils/SpaceTimeGrid.cc @@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia { return new GridRedBlackCartesian(FourDimGrid); } - +GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector & latt,const std::vector &mpi) +{ + std::vector simd(4,1); + return makeFourDimGrid(latt,simd,mpi); +} GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid) { int N4=FourDimGrid->_ndimension; @@ -58,6 +62,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian return new GridCartesian(latt5,simd5,mpi5); } + GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid) { int N4=FourDimGrid->_ndimension; @@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); } + +GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid) +{ + int N4=FourDimGrid->_ndimension; + int nsimd = FourDimGrid->Nsimd(); + + std::vector latt5(1,Ls); + std::vector simd5(1,nsimd); + std::vector mpi5(1,1); + + for(int d=0;d_fdimensions[d]); + simd5.push_back(1); + mpi5.push_back(FourDimGrid->_processors[d]); + } + return new GridCartesian(latt5,simd5,mpi5); +} + +GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid) +{ + int N4=FourDimGrid->_ndimension; + int nsimd = FourDimGrid->Nsimd(); + int cbd=0; + std::vector latt5(1,Ls); + std::vector simd5(1,nsimd); + std::vector mpi5(1,1); + std::vector cb5(1,1); + + for(int d=0;d_fdimensions[d]); + simd5.push_back(1); + mpi5.push_back(FourDimGrid->_processors[d]); + cb5.push_back(1); + } + return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); +} + + }} diff --git a/lib/qcd/utils/SpaceTimeGrid.h b/lib/qcd/utils/SpaceTimeGrid.h index 94862ca2..61613e4d 100644 --- a/lib/qcd/utils/SpaceTimeGrid.h +++ b/lib/qcd/utils/SpaceTimeGrid.h @@ -35,9 +35,14 @@ class SpaceTimeGrid { static GridCartesian *makeFourDimGrid(const std::vector & latt,const std::vector &simd,const std::vector &mpi); static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid); + static GridCartesian *makeFiveDimGrid (int Ls,const GridCartesian *FourDimGrid); static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid); + static GridCartesian *makeFiveDimDWFGrid (int Ls,const GridCartesian *FourDimGrid); + static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid); + static GridCartesian *makeFourDimDWFGrid (const std::vector & latt,const std::vector &mpi); + }; }} diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h index 93318d9c..ad98213d 100644 --- a/lib/tensors/Tensor_extract_merge.h +++ b/lib/tensors/Tensor_extract_merge.h @@ -44,7 +44,7 @@ template inline void extract(typename std::enable_if::value, const vsimd >::type * y, std::vector &extracted,int offset){ // FIXME: bounce off memory is painful - static const int Nsimd=vsimd::Nsimd(); + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); int Nextr=extracted.size(); int s=Nsimd/Nextr; @@ -59,7 +59,9 @@ inline void extract(typename std::enable_if::value, const v template inline void merge(typename std::enable_if::value, vsimd >::type * y, std::vector &extracted,int offset){ - static const int Nsimd=vsimd::Nsimd(); + + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); + int Nextr=extracted.size(); int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it // replicate n-fold. Use to allow Integer masks to @@ -127,7 +129,7 @@ template inline void extract(const vobj &vec,std::vector &extracted) typedef typename vobj::scalar_type scalar_type ; typedef typename vobj::vector_type vector_type ; - static const int Nsimd=vobj::vector_type::Nsimd(); + static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); static const int words=sizeof(vobj)/sizeof(vector_type); int Nextr = extracted.size(); @@ -199,7 +201,7 @@ void merge(vobj &vec,std::vector &extracted,int typedef typename vobj::scalar_type scalar_type ; typedef typename vobj::vector_type vector_type ; - const int Nsimd=vobj::vector_type::Nsimd(); + const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); const int words=sizeof(vobj)/sizeof(vector_type); int Nextr=extracted.size(); diff --git a/tests/Make.inc b/tests/Make.inc index 98323b45..cd4ab811 100644 --- a/tests/Make.inc +++ b/tests/Make.inc @@ -1,5 +1,5 @@ -bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd +bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd Test_GaugeAction_SOURCES=Test_GaugeAction.cc @@ -58,6 +58,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc Test_cshift_red_black_LDADD=-lGrid +Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc +Test_cshift_red_black_rotate_LDADD=-lGrid + + +Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc +Test_cshift_rotate_LDADD=-lGrid + + Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc Test_dwf_cg_prec_LDADD=-lGrid @@ -98,6 +106,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc Test_dwf_lanczos_LDADD=-lGrid +Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc +Test_dwf_rb5d_LDADD=-lGrid + + Test_gamma_SOURCES=Test_gamma.cc Test_gamma_LDADD=-lGrid diff --git a/tests/Test_cshift_red_black_rotate.cc b/tests/Test_cshift_red_black_rotate.cc new file mode 100644 index 00000000..69bfdf3d --- /dev/null +++ b/tests/Test_cshift_red_black_rotate.cc @@ -0,0 +1,223 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift_red_black.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + int Nd = latt_size.size(); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + std::vector mask(Nd,1); + mask[0]=0; + + GridCartesian Fine (latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1); + + GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice(); + + LatticeComplex U(&Fine); + LatticeComplex ShiftU(&Fine); + LatticeComplex rbShiftU(&Fine); + LatticeComplex Ue(&RBFine); + LatticeComplex Uo(&RBFine); + LatticeComplex ShiftUe(&RBFine); + LatticeComplex ShiftUo(&RBFine); + LatticeComplex lex(&Fine); + lex=zero; + Integer stride =1; + { + double nrm; + LatticeComplex coor(&Fine); + + for(int d=0;d coor(4); + + std::cout< scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + double nrm = abs(scm-cm()()()); + std::vector peer(4); + Complex ctmp = cm; + Integer index=real(ctmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + if (nrm > 0){ + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir + <<" ["< scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + std::vector peer(4); + Complex ctmp=cmeo; + Integer index=real(ctmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + double nrm = abs(cmeo()()()-scm); + if (nrm != 0) { + std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir + <<" ["< 0){ + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir + <<" ["< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + + GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice(); + + LatticeComplex U(&Fine); + LatticeComplex ShiftU(&Fine); + + LatticeComplex lex(&Fine); + lex=zero; + Integer stride =1; + { + double nrm; + LatticeComplex coor(&Fine); + + for(int d=0;d<4;d++){ + LatticeCoordinate(coor,d); + lex = lex + coor*stride; + stride=stride*latt_size[d]; + } + U=lex; + } + + + TComplex cm; + + for(int dir=0;dir<4;dir++){ + for(int shift=0;shift coor(4); + + for(coor[3]=0;coor[3] scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + nrm = abs(scm-cm()()()); + std::vector peer(4); + Complex tmp =cm; + Integer index=real(tmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + if (nrm > 0){ + std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["< +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +typedef WilsonFermion5D WilsonFermion5DR; +typedef WilsonFermion5D WilsonFermion5DF; +typedef WilsonFermion5D WilsonFermion5DD; + +typedef WilsonFermion5D WilsonFermion5D_OKR; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + + LatticeFermion src (FGrid); random(RNG5,src); + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + LatticeGaugeField Umu(UGrid); random(RNG4,Umu); + std::vector U(4,UGrid); + + // Only one non-zero (y) + /* + Umu=zero; + for(int nn=0;nn(Umu,U[nn],nn); + } + */ + + RealD mass=0.1; + RealD M5 =1.8; + typename WilsonFermion5DR::ImplParams params; + + WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params); + + Dw.Dhop(src,result,0); + + std::cout << "Norm src = "< void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);} + template void apply(std::vector &rr,std::vector &in) const { + int sz=in.size(); + int msk = sz>>(n+1); + for(int i=0;i void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);} + template void apply(std::vector &rr,std::vector &in) const { + int sz = in.size(); + for(int i=0;i +void PermTester(const functor &func) +{ + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + std::vector result(Nsimd); + std::vector reference(Nsimd); + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + vec & v_result = buf[2]; + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + merge(v_result,result); + + func(v_result,v_input1,v_input2); + + func.apply(reference,input1); + + extract(v_result,result); + std::cout<1.0e-7){ + std::cout<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + + std::cout << GridLogMessage <<"==================================="<< std::endl; std::cout << GridLogMessage <<"Testing vRealD "<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + + + std::cout << GridLogMessage <<"==================================="<< std::endl; std::cout << GridLogMessage <<"Testing vComplexF "<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + std::cout<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + + std::cout<(funcRotate(r)); + } + Grid_finalize(); } From d9b5e66877066c1ac65bc3846b3321ee6ac2e29b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 20 Apr 2016 18:25:48 +0100 Subject: [PATCH 17/21] Update Make.inc --- lib/Make.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Make.inc b/lib/Make.inc index a7639d01..776e2271 100644 --- a/lib/Make.inc +++ b/lib/Make.inc @@ -1,4 +1,4 @@ -HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonFermion5DRedBlack.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/NerscCheckpointer.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512avxAddsub.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h +HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/NerscCheckpointer.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512avxAddsub.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./PerfCount.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc From a6dfa2386b98625767393a803da560ec47725d6b Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 22 Apr 2016 06:33:41 -0700 Subject: [PATCH 18/21] GCC choked on intrinsics calls that ICPC did not --- lib/simd/Grid_avx.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index d05551c7..03faabee 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -438,8 +438,8 @@ namespace Optimization { }; #if defined (AVX2) || defined (AVXFMA4) -#define _mm256_alignr_epi32(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*4)%16) -#define _mm256_alignr_epi64(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*8)%16) +#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16) +#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16) #endif #if defined (AVX1) @@ -449,26 +449,26 @@ namespace Optimization { \ aa = _mm256_extractf128_ps(a,1); \ bb = _mm256_extractf128_ps(b,1); \ - aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \ ret = _mm256_insertf128_ps(ret,aa,1); \ \ aa = _mm256_extractf128_ps(a,0); \ bb = _mm256_extractf128_ps(b,0); \ - aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \ ret = _mm256_insertf128_ps(ret,aa,0); \ } #define _mm256_alignr_epi64(ret,a,b,n) { \ - __m128 aa, bb; \ + __m128d aa, bb; \ \ aa = _mm256_extractf128_pd(a,1); \ bb = _mm256_extractf128_pd(b,1); \ - aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \ ret = _mm256_insertf128_pd(ret,aa,1); \ \ aa = _mm256_extractf128_pd(a,0); \ bb = _mm256_extractf128_pd(b,0); \ - aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \ ret = _mm256_insertf128_pd(ret,aa,0); \ } From e3f141f82f9ea13498f59bb745edbfec99c740f5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 22 Apr 2016 10:30:30 -0700 Subject: [PATCH 19/21] Fixed SSE compile with typecasts --- lib/simd/Grid_sse4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 2af91f44..a4002373 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -316,8 +316,8 @@ namespace Optimization { #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) - template static inline __m128 tRotate(__m128 in){ return _mm_alignr_epi32(in,in,n); }; - template static inline __m128d tRotate(__m128d in){ return _mm_alignr_epi64(in,in,n); }; + template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; }; ////////////////////////////////////////////// From c79ea0dcef2c73802fa7902dbc92e7e5da8ea03d Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 22 Apr 2016 21:52:54 -0700 Subject: [PATCH 20/21] Fixingn IMCI --- configure.ac | 11 +---- lib/algorithms/approx/Remez.h | 6 ++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 2 +- lib/simd/Grid_imci.h | 52 +++++++++++++++++++++- 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/configure.ac b/configure.ac index 4da6e3d9..88d85c99 100644 --- a/configure.ac +++ b/configure.ac @@ -55,15 +55,6 @@ echo ::::::::::::::::::::::::::::::::::::::::::: AC_CHECK_FUNCS([gettimeofday]) -#AC_CHECK_LIB([gmp],[__gmpf_init],, -# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system. -#Please install or provide the correct path to your installation -#Info at: http://www.gmplib.org)]) - -#AC_CHECK_LIB([mpfr],[mpfr_init],, -# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system. -#Please install or provide the correct path to your installation -#Info at: http://www.mpfr.org/)]) # # SIMD instructions selection @@ -124,7 +115,7 @@ case ${ac_SIMD} in echo Configuring for IMCI AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] ) supported="cross compilation" - ac_ZMM=yes; + ac_ZMM=no; ;; NEONv8) echo Configuring for experimental ARMv8a support diff --git a/lib/algorithms/approx/Remez.h b/lib/algorithms/approx/Remez.h index 6e3cf05b..4a56d5d2 100644 --- a/lib/algorithms/approx/Remez.h +++ b/lib/algorithms/approx/Remez.h @@ -16,9 +16,13 @@ #define INCLUDED_ALG_REMEZ_H #include +#include -//#include +#ifdef HAVE_GMP_H +#include +#else #include +#endif #define JMAX 10000 //Maximum number of iterations of Newton's approximation #define SUM_MAX 10 // Maximum number of terms in exponential diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 859d1a20..980a2b17 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -27,7 +27,7 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ #include -#if defined(AVX512) || defined (IMCI) +#if defined(AVX512) //#if defined (IMCI) #include diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h index 044c5cb4..119a31a3 100644 --- a/lib/simd/Grid_imci.h +++ b/lib/simd/Grid_imci.h @@ -36,7 +36,9 @@ Author: paboyle //---------------------------------------------------------------------- #include +#include +namespace Grid{ namespace Optimization { struct Vsplat{ @@ -316,6 +318,54 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); + } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); + }; + + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n); + }; + + }; + ////////////////////////////////////////////// @@ -358,7 +408,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -namespace Grid { + typedef __m512 SIMD_Ftype; // Single precision type typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type From 1e554350acae0e67fa7177ed0db9d4f684a54af2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 29 Apr 2016 16:49:18 -0700 Subject: [PATCH 21/21] The threaded coms didn't agree with GCC. Suprised, and looks like GCC bug. --- lib/Stencil.h | 7 +++++-- lib/qcd/action/fermion/WilsonFermion5D.cc | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 2b238622..ac351301 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -594,8 +594,11 @@ PARALLEL_FOR_LOOP template void HaloExchange(const Lattice &source,compressor &compress) { - auto thr = HaloExchangeBegin(source,compress); - HaloExchangeComplete(thr); + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + Communicate(); + CommsMerge(); } void HaloExchangeComplete(std::thread &thr) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index b8dba2f7..7ab82a2a 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -359,8 +359,9 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes int LLs = in._grid->_rdimensions[0]; commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - st.HaloExchangeComplete(handle); + // auto handle = st.HaloExchangeBegin(in,compressor); + // st.HaloExchangeComplete(handle); + st.HaloExchange(in,compressor); commtime +=usecond(); jointime -=usecond();