diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc index ebe7282e..e5039b16 100644 --- a/benchmarks/Benchmark_zmm.cc +++ b/benchmarks/Benchmark_zmm.cc @@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) t1=usecond(); mfa = flops*ncall/(t1-t0); std::cout< &latt4,int Ls) diff = resulto-resulta; std::cout< #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D) #define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B) +#define RotateBit (0x100) + namespace Grid { typedef uint32_t Integer; diff --git a/lib/Stencil.h b/lib/Stencil.h index c2f083ff..ac351301 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -321,6 +321,9 @@ PARALLEL_FOR_LOOP int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; @@ -368,7 +371,8 @@ PARALLEL_FOR_LOOP int rd = _grid->_rdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int gd = _grid->_gdimensions[dimension]; - + int ly = _grid->_simd_layout[dimension]; + // Map to always positive shift modulo global full dimension. int shift = (shiftpm+fd)%fd; @@ -398,7 +402,7 @@ PARALLEL_FOR_LOOP int wrap = sshift/rd; int num = sshift%rd; if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; + else permute_slice = (wrap+1)%ly; } CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); @@ -418,7 +422,6 @@ PARALLEL_FOR_LOOP int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; - // assert(simd_layout==1); // Why? assert(comm_dim==1); int shift = (shiftpm + fd) %fd; assert(shift>=0); @@ -529,7 +532,7 @@ PARALLEL_FOR_LOOP _entries[point][lo+o+b]._around_the_world=wrap; } - } + } o +=_grid->_slice_stride[dimension]; } @@ -591,8 +594,11 @@ PARALLEL_FOR_LOOP template void HaloExchange(const Lattice &source,compressor &compress) { - auto thr = HaloExchangeBegin(source,compress); - HaloExchangeComplete(thr); + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + Communicate(); + CommsMerge(); } void HaloExchangeComplete(std::thread &thr) @@ -749,6 +755,7 @@ PARALLEL_FOR_LOOP int comm_dim = _grid->_processors[dimension] >1 ; assert(comm_dim==1); + // This will not work with a rotate dim assert(simd_layout==2); assert(shift>=0); assert(shift2 // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); diff --git a/lib/algorithms/approx/Remez.h b/lib/algorithms/approx/Remez.h index 6e3cf05b..4a56d5d2 100644 --- a/lib/algorithms/approx/Remez.h +++ b/lib/algorithms/approx/Remez.h @@ -16,9 +16,13 @@ #define INCLUDED_ALG_REMEZ_H #include +#include -//#include +#ifdef HAVE_GMP_H +#include +#else #include +#endif #define JMAX 10000 //Maximum number of iterations of Newton's approximation #define SUM_MAX 10 // Maximum number of terms in exponential diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 1e89fb00..c53d9318 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -564,6 +564,7 @@ until convergence for(int j=k1-1; j(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + static inline __m256d rotate(__m256d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } + } + + + template + static inline __m256 tRotate(__m256 in){ + __m256 tmp = Permute::Permute0(in); + __m256 ret; + if ( n > 3 ) { + _mm256_alignr_epi32(ret,in,tmp,n); + } else { + _mm256_alignr_epi32(ret,tmp,in,n); + } + // std::cout << " align epi32 n=" < "<< ret < + static inline __m256d tRotate(__m256d in){ + __m256d tmp = Permute::Permute0(in); + __m256d ret; + if ( n > 1 ) { + _mm256_alignr_epi64(ret,in,tmp,n); + } else { + _mm256_alignr_epi64(ret,tmp,in,n); + } + // std::cout << " align epi64 n=" < "<< ret < diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 5d014137..b057a61b 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -309,6 +309,54 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); + } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); + }; + + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n); + }; + + }; + ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_empty.h b/lib/simd/Grid_empty.h index 8858624e..5ab75de7 100644 --- a/lib/simd/Grid_empty.h +++ b/lib/simd/Grid_empty.h @@ -55,51 +55,67 @@ namespace Optimization { struct Vsplat{ //Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(float a, float b){ + u128f out; + out.f[0] = a; + out.f[1] = b; + out.f[2] = a; + out.f[3] = b; + return out; } // Real float - inline float operator()(float a){ - return 0; + inline u128f operator()(float a){ + u128f out; + out.f[0] = a; + out.f[1] = a; + out.f[2] = a; + out.f[3] = a; + return out; } //Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(double a, double b){ + u128d out; + out.f[0] = a; + out.f[1] = b; + return out; } //Real double - inline double operator()(double a){ - return 0; + inline u128d operator()(double a){ + u128d out; + out.f[0] = a; + out.f[1] = a; + return out; } //Integer inline int operator()(Integer a){ - return 0; + return a; } }; struct Vstore{ //Float - inline void operator()(float a, float* F){ - + inline void operator()(u128f a, float* F){ + memcpy(F,a.f,4*sizeof(float)); } //Double - inline void operator()(double a, double* D){ - + inline void operator()(u128d a, double* D){ + memcpy(D,a.f,2*sizeof(double)); } //Integer inline void operator()(int a, Integer* I){ - + I[0] = a; } }; struct Vstream{ //Float - inline void operator()(float * a, float b){ - + inline void operator()(float * a, u128f b){ + memcpy(a,b.f,4*sizeof(float)); } //Double - inline void operator()(double * a, double b){ - + inline void operator()(double * a, u128d b){ + memcpy(a,b.f,2*sizeof(double)); } @@ -107,24 +123,40 @@ namespace Optimization { struct Vset{ // Complex float - inline float operator()(Grid::ComplexF *a){ - return 0; + inline u128f operator()(Grid::ComplexF *a){ + u128f out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + out.f[2] = a[1].real(); + out.f[3] = a[1].imag(); + return out; } // Complex double - inline double operator()(Grid::ComplexD *a){ - return 0; + inline u128d operator()(Grid::ComplexD *a){ + u128d out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + return out; } // Real float - inline float operator()(float *a){ - return 0; + inline u128f operator()(float *a){ + u128f out; + out.f[0] = a[0]; + out.f[1] = a[1]; + out.f[2] = a[2]; + out.f[3] = a[3]; + return out; } // Real double - inline double operator()(double *a){ - return 0; + inline u128d operator()(double *a){ + u128d out; + out.f[0] = a[0]; + out.f[1] = a[1]; + return out; } // Integer inline int operator()(Integer *a){ - return 0; + return a[0]; } @@ -146,129 +178,198 @@ namespace Optimization { ///////////////////////////////////////////////////// struct Sum{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + out.f[2] = a.f[2] + b.f[2]; + out.f[3] = a.f[3] + b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a + b; } }; struct Sub{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + out.f[2] = a.f[2] - b.f[2]; + out.f[3] = a.f[3] - b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a-b; } }; struct MultComplex{ // Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3]; + out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2]; + return out; } // Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + return out; } }; struct Mult{ - inline float mac(float a, float b,double c){ - return 0; - } - inline double mac(double a, double b,double c){ - return 0; - } + //CK: Appear unneeded + // inline float mac(float a, float b,double c){ + // return 0; + // } + // inline double mac(double a, double b,double c){ + // return 0; + // } + // Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + out.f[2] = a.f[2]*b.f[2]; + out.f[3] = a.f[3]*b.f[3]; + return out; } // Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + return out; } // Integer inline int operator()(int a, int b){ - return 0; + return a*b; } }; struct Conj{ // Complex single - inline float operator()(float in){ - return 0; + inline u128f operator()(u128f in){ + u128f out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + out.f[2] = in.f[2]; + out.f[3] = -in.f[3]; + return out; } // Complex double - inline double operator()(double in){ - return 0; + inline u128d operator()(u128d in){ + u128d out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + return out; } // do not define for integer input }; struct TimesMinusI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = -in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + return out; } - - }; struct TimesI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = -in.f[3]; + out.f[3] = in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + return out; } }; ////////////////////////////////////////////// // Some Template specialization struct Permute{ - - static inline float Permute0(float in){ + //We just have to mirror the permutes of Grid_sse4.h + static inline u128f Permute0(u128f in){ //AB CD -> CD AB + u128f out; + out.f[0] = in.f[2]; + out.f[1] = in.f[3]; + out.f[2] = in.f[0]; + out.f[3] = in.f[1]; + return out; + }; + static inline u128f Permute1(u128f in){ //AB CD -> BA DC + u128f out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = in.f[2]; + return out; + }; + static inline u128f Permute2(u128f in){ return in; }; - static inline float Permute1(float in){ - return in; - }; - static inline float Permute2(float in){ - return in; - }; - static inline float Permute3(float in){ + static inline u128f Permute3(u128f in){ return in; }; - static inline double Permute0(double in){ + static inline u128d Permute0(u128d in){ //AB -> BA + u128d out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + return out; + }; + static inline u128d Permute1(u128d in){ return in; }; - static inline double Permute1(double in){ + static inline u128d Permute2(u128d in){ return in; }; - static inline double Permute2(double in){ - return in; - }; - static inline double Permute3(double in){ + static inline u128d Permute3(u128d in){ return in; }; @@ -280,26 +381,26 @@ namespace Optimization { //Complex float Reduce template<> - inline Grid::ComplexF Reduce::operator()(float in){ - return 0; + inline Grid::ComplexF Reduce::operator()(u128f in){ //2 complex + return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]); } //Real float Reduce template<> - inline Grid::RealF Reduce::operator()(float in){ - return 0; + inline Grid::RealF Reduce::operator()(u128f in){ //4 floats + return in.f[0] + in.f[1] + in.f[2] + in.f[3]; } //Complex double Reduce template<> - inline Grid::ComplexD Reduce::operator()(double in){ - return 0; + inline Grid::ComplexD Reduce::operator()(u128d in){ //1 complex + return Grid::ComplexD(in.f[0],in.f[1]); } //Real double Reduce template<> - inline Grid::RealD Reduce::operator()(double in){ - return 0; + inline Grid::RealD Reduce::operator()(u128d in){ //2 doubles + return in.f[0] + in.f[1]; } //Integer Reduce @@ -314,8 +415,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef float SIMD_Ftype; // Single precision type - typedef double SIMD_Dtype; // Double precision type + typedef Optimization::u128f SIMD_Ftype; // Single precision type + typedef Optimization::u128d SIMD_Dtype; // Double precision type typedef int SIMD_Itype; // Integer type // prefetch utilities diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h index 044c5cb4..119a31a3 100644 --- a/lib/simd/Grid_imci.h +++ b/lib/simd/Grid_imci.h @@ -36,7 +36,9 @@ Author: paboyle //---------------------------------------------------------------------- #include +#include +namespace Grid{ namespace Optimization { struct Vsplat{ @@ -316,6 +318,54 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m512 rotate(__m512 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + + case 8 : return tRotate<8>(in);break; + case 9 : return tRotate<9>(in);break; + case 10: return tRotate<10>(in);break; + case 11: return tRotate<11>(in);break; + case 12: return tRotate<12>(in);break; + case 13: return tRotate<13>(in);break; + case 14: return tRotate<14>(in);break; + case 15: return tRotate<15>(in);break; + default: assert(0); + } + } + static inline __m512d rotate(__m512d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + case 4: return tRotate<4>(in);break; + case 5: return tRotate<5>(in);break; + case 6: return tRotate<6>(in);break; + case 7: return tRotate<7>(in);break; + default: assert(0); + } + } + + template static inline __m512 tRotate(__m512 in){ + return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n); + }; + + template static inline __m512d tRotate(__m512d in){ + return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n); + }; + + }; + ////////////////////////////////////////////// @@ -358,7 +408,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -namespace Grid { + typedef __m512 SIMD_Ftype; // Single precision type typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b88ad4c9..a4002373 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -267,10 +267,10 @@ namespace Optimization { struct Permute{ static inline __m128 Permute0(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB }; static inline __m128 Permute1(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC }; static inline __m128 Permute2(__m128 in){ return in; @@ -279,7 +279,7 @@ namespace Optimization { return in; }; - static inline __m128d Permute0(__m128d in){ + static inline __m128d Permute0(__m128d in){ //AB -> BA return _mm_shuffle_pd(in,in,0x1); }; static inline __m128d Permute1(__m128d in){ @@ -294,6 +294,32 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m128 rotate(__m128 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } + } + static inline __m128d rotate(__m128d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + default: assert(0); + } + } + +#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) + + template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; + + }; ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index c247f15d..b485a47a 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -299,16 +299,44 @@ namespace Grid { } friend inline void permute(Grid_simd &y,Grid_simd b,int perm) { - if (perm==3) permute3(y,b); - else if (perm==2) permute2(y,b); - else if (perm==1) permute1(y,b); - else if (perm==0) permute0(y,b); + if ( perm & RotateBit ) { + int dist = perm&0xF; + y=rotate(b,dist); + return; + } + switch(perm){ + case 3: permute3(y,b); break; + case 2: permute2(y,b); break; + case 1: permute1(y,b); break; + case 0: permute0(y,b); break; + default: assert(0); + } } - - };// end of Grid_simd class definition + //////////////////////////////////////////////////////////////////// + // General rotate + //////////////////////////////////////////////////////////////////// + template =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Real by "< =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Complex by "< inline void extract(typename std::enable_if::value, const vsimd >::type * y, std::vector &extracted,int offset){ // FIXME: bounce off memory is painful - static const int Nsimd=vsimd::Nsimd(); + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); int Nextr=extracted.size(); int s=Nsimd/Nextr; @@ -59,7 +59,9 @@ inline void extract(typename std::enable_if::value, const v template inline void merge(typename std::enable_if::value, vsimd >::type * y, std::vector &extracted,int offset){ - static const int Nsimd=vsimd::Nsimd(); + + static const int Nsimd=sizeof(vsimd)/sizeof(scalar); + int Nextr=extracted.size(); int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it // replicate n-fold. Use to allow Integer masks to @@ -127,7 +129,7 @@ template inline void extract(const vobj &vec,std::vector &extracted) typedef typename vobj::scalar_type scalar_type ; typedef typename vobj::vector_type vector_type ; - static const int Nsimd=vobj::vector_type::Nsimd(); + static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); static const int words=sizeof(vobj)/sizeof(vector_type); int Nextr = extracted.size(); @@ -199,7 +201,7 @@ void merge(vobj &vec,std::vector &extracted,int typedef typename vobj::scalar_type scalar_type ; typedef typename vobj::vector_type vector_type ; - const int Nsimd=vobj::vector_type::Nsimd(); + const int Nsimd=sizeof(vector_type)/sizeof(scalar_type); const int words=sizeof(vobj)/sizeof(vector_type); int Nextr=extracted.size(); diff --git a/tests/Make.inc b/tests/Make.inc index 98323b45..cd4ab811 100644 --- a/tests/Make.inc +++ b/tests/Make.inc @@ -1,5 +1,5 @@ -bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd +bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd Test_GaugeAction_SOURCES=Test_GaugeAction.cc @@ -58,6 +58,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc Test_cshift_red_black_LDADD=-lGrid +Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc +Test_cshift_red_black_rotate_LDADD=-lGrid + + +Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc +Test_cshift_rotate_LDADD=-lGrid + + Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc Test_dwf_cg_prec_LDADD=-lGrid @@ -98,6 +106,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc Test_dwf_lanczos_LDADD=-lGrid +Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc +Test_dwf_rb5d_LDADD=-lGrid + + Test_gamma_SOURCES=Test_gamma.cc Test_gamma_LDADD=-lGrid diff --git a/tests/Test_cshift_red_black_rotate.cc b/tests/Test_cshift_red_black_rotate.cc new file mode 100644 index 00000000..69bfdf3d --- /dev/null +++ b/tests/Test_cshift_red_black_rotate.cc @@ -0,0 +1,223 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift_red_black.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + int Nd = latt_size.size(); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + std::vector mask(Nd,1); + mask[0]=0; + + GridCartesian Fine (latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1); + + GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice(); + + LatticeComplex U(&Fine); + LatticeComplex ShiftU(&Fine); + LatticeComplex rbShiftU(&Fine); + LatticeComplex Ue(&RBFine); + LatticeComplex Uo(&RBFine); + LatticeComplex ShiftUe(&RBFine); + LatticeComplex ShiftUo(&RBFine); + LatticeComplex lex(&Fine); + lex=zero; + Integer stride =1; + { + double nrm; + LatticeComplex coor(&Fine); + + for(int d=0;d coor(4); + + std::cout< scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + double nrm = abs(scm-cm()()()); + std::vector peer(4); + Complex ctmp = cm; + Integer index=real(ctmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + if (nrm > 0){ + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir + <<" ["< scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + std::vector peer(4); + Complex ctmp=cmeo; + Integer index=real(ctmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + double nrm = abs(cmeo()()()-scm); + if (nrm != 0) { + std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir + <<" ["< 0){ + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir + <<" ["< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + + GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice(); + + LatticeComplex U(&Fine); + LatticeComplex ShiftU(&Fine); + + LatticeComplex lex(&Fine); + lex=zero; + Integer stride =1; + { + double nrm; + LatticeComplex coor(&Fine); + + for(int d=0;d<4;d++){ + LatticeCoordinate(coor,d); + lex = lex + coor*stride; + stride=stride*latt_size[d]; + } + U=lex; + } + + + TComplex cm; + + for(int dir=0;dir<4;dir++){ + for(int shift=0;shift coor(4); + + for(coor[3]=0;coor[3] scoor(coor); + scoor[dir] = (scoor[dir]+shift)%latt_size[dir]; + + Integer slex = scoor[0] + + latt_size[0]*scoor[1] + + latt_size[0]*latt_size[1]*scoor[2] + + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3]; + + Complex scm(slex); + + nrm = abs(scm-cm()()()); + std::vector peer(4); + Complex tmp =cm; + Integer index=real(tmp); + Lexicographic::CoorFromIndex(peer,index,latt_size); + + if (nrm > 0){ + std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["< +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +typedef WilsonFermion5D WilsonFermion5DR; +typedef WilsonFermion5D WilsonFermion5DF; +typedef WilsonFermion5D WilsonFermion5DD; + +typedef WilsonFermion5D WilsonFermion5D_OKR; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + + LatticeFermion src (FGrid); random(RNG5,src); + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + LatticeGaugeField Umu(UGrid); random(RNG4,Umu); + std::vector U(4,UGrid); + + // Only one non-zero (y) + /* + Umu=zero; + for(int nn=0;nn(Umu,U[nn],nn); + } + */ + + RealD mass=0.1; + RealD M5 =1.8; + typename WilsonFermion5DR::ImplParams params; + + WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params); + + Dw.Dhop(src,result,0); + + std::cout << "Norm src = "< void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);} + template void apply(std::vector &rr,std::vector &in) const { + int sz=in.size(); + int msk = sz>>(n+1); + for(int i=0;i void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);} + template void apply(std::vector &rr,std::vector &in) const { + int sz = in.size(); + for(int i=0;i +void PermTester(const functor &func) +{ + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + std::vector result(Nsimd); + std::vector reference(Nsimd); + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + vec & v_result = buf[2]; + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + merge(v_result,result); + + func(v_result,v_input1,v_input2); + + func.apply(reference,input1); + + extract(v_result,result); + std::cout<1.0e-7){ + std::cout<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + + std::cout << GridLogMessage <<"==================================="<< std::endl; std::cout << GridLogMessage <<"Testing vRealD "<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + + + std::cout << GridLogMessage <<"==================================="<< std::endl; std::cout << GridLogMessage <<"Testing vComplexF "<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + std::cout<(funcRotate(r)); + } + std::cout<(funcInnerProduct()); ReductionTester(funcReduce()); + + std::cout<(funcPermute(i)); + } + + + std::cout<(funcRotate(r)); + } + Grid_finalize(); }