diff --git a/configure.ac b/configure.ac index a6658a96..90764cb7 100644 --- a/configure.ac +++ b/configure.ac @@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -xavx';; AVXFMA) - AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma';; + AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) + SIMD_FLAGS='-mavx -fma';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; diff --git a/lib/FFT.h b/lib/FFT.h index b5b31d82..240f338b 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -244,7 +244,10 @@ namespace Grid { pokeLocalSite(s,pgbuf,cbuf); } } - result = Cshift(result,dim,L); + if (p != processors[dim] - 1) + { + result = Cshift(result,dim,L); + } } // Loop over orthog coords @@ -287,10 +290,10 @@ namespace Grid { cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; peekLocalSite(s,pgbuf,cgbuf); - s = s * div; pokeLocalSite(s,result,clbuf); } } + result = result*div; // destroying plan FFTW::fftw_destroy_plan(p); diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index f50eae2b..36360102 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -167,7 +167,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -195,7 +195,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -216,7 +216,7 @@ namespace Optimization { struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ -#if defined (AVX1) +#if defined (AVX1) __m256 ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br @@ -233,7 +233,7 @@ namespace Optimization { a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -264,7 +264,7 @@ namespace Optimization { IF IMM0[3] = 0 THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged */ -#if defined (AVX1) +#if defined (AVX1) __m256d ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -279,7 +279,7 @@ namespace Optimization { a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -320,7 +320,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_ps(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_ps( b, c, a); #endif } @@ -332,7 +332,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_pd(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_pd( b, c, a); #endif } @@ -347,7 +347,7 @@ namespace Optimization { } // Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) +#if defined (AVX1) || defined (AVXFMA) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 07933f52..bc86291d 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -244,7 +244,22 @@ namespace Optimization { return a*b; } }; - + + struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdiv(a, b); + } + + // Real float + FLOAT_WRAP_2(operator(), inline) + + // Integer + inline int operator()(int a, int b){ + return a/b; + } + }; + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -413,6 +428,7 @@ template using ReduceSIMD = Optimization::Reduce; typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 184baad9..080dd5c0 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -44,7 +44,7 @@ directory #ifdef SSE4 #include "Grid_sse4.h" #endif -#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) #include "Grid_avx.h" #endif #if defined AVX512 diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 189f0559..92f9bcd8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -50,6 +50,12 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} std::string name(void) const { return std::string("Times"); } }; +class funcDivide { +public: + funcDivide() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} + std::string name(void) const { return std::string("Divide"); } +}; class funcConj { public: funcConj() {}; @@ -341,6 +347,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); @@ -371,6 +378,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 4eb4398d..22838f7b 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - (TwoPiL * p[mu]) * coor; + C = C + (TwoPiL * p[mu]) * coor; } C = exp(C*ci); @@ -78,10 +78,11 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<