1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

This commit is contained in:
Guido Cossu 2016-11-26 18:25:32 +00:00
commit d8258f0758
7 changed files with 53 additions and 24 deletions

View File

@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in
AC_DEFINE([AVX1],[1],[AVX intrinsics]) AC_DEFINE([AVX1],[1],[AVX intrinsics])
SIMD_FLAGS='-mavx -xavx';; SIMD_FLAGS='-mavx -xavx';;
AVXFMA) AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4]) AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
SIMD_FLAGS='-mavx -mfma';; SIMD_FLAGS='-mavx -fma';;
AVX2) AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;

View File

@ -244,7 +244,10 @@ namespace Grid {
pokeLocalSite(s,pgbuf,cbuf); pokeLocalSite(s,pgbuf,cbuf);
} }
} }
result = Cshift(result,dim,L); if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
} }
// Loop over orthog coords // Loop over orthog coords
@ -287,10 +290,10 @@ namespace Grid {
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf,cgbuf);
s = s * div;
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result,clbuf);
} }
} }
result = result*div;
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);

View File

@ -167,7 +167,7 @@ namespace Optimization {
} }
//Integer //Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA4) #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);
@ -195,7 +195,7 @@ namespace Optimization {
} }
//Integer //Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA4) #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);
@ -216,7 +216,7 @@ namespace Optimization {
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline __m256 operator()(__m256 a, __m256 b){ inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1) #if defined (AVX1)
__m256 ymm0,ymm1,ymm2; __m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
@ -233,7 +233,7 @@ namespace Optimization {
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -264,7 +264,7 @@ namespace Optimization {
IF IMM0[3] = 0 IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
*/ */
#if defined (AVX1) #if defined (AVX1)
__m256d ymm0,ymm1,ymm2; __m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
@ -279,7 +279,7 @@ namespace Optimization {
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -320,7 +320,7 @@ namespace Optimization {
#if defined (AVXFMA4) #if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a); a= _mm256_macc_ps(b,c,a);
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_ps( b, c, a); a= _mm256_fmadd_ps( b, c, a);
#endif #endif
} }
@ -332,7 +332,7 @@ namespace Optimization {
#if defined (AVXFMA4) #if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a); a= _mm256_macc_pd(b,c,a);
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_pd( b, c, a); a= _mm256_fmadd_pd( b, c, a);
#endif #endif
} }
@ -347,7 +347,7 @@ namespace Optimization {
} }
// Integer // Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) #if defined (AVX1) || defined (AVXFMA)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);

View File

@ -244,7 +244,22 @@ namespace Optimization {
return a*b; return a*b;
} }
}; };
// Division functor: one call operator per supported operand kind.
// NOTE(review): vector4double suggests this belongs to the QPX (Blue Gene/Q)
// backend -- confirm against the enclosing header.
struct Div{
  // Real double: hands both operands to vec_swdiv and returns its result.
  // NOTE(review): presumably the vector software-divide built-in -- confirm.
  inline vector4double operator()(vector4double num, vector4double den){
    return vec_swdiv(num, den);
  }

  // Real float: overload produced by the FLOAT_WRAP_2 macro (its definition
  // is not visible in this hunk).
  FLOAT_WRAP_2(operator(), inline)

  // Integer: built-in integer division of a by b.
  inline int operator()(int a, int b){
    const int quotient = a/b;
    return quotient;
  }
};
struct Conj{ struct Conj{
// Complex double // Complex double
inline vector4double operator()(vector4double v){ inline vector4double operator()(vector4double v){
@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
typedef Optimization::Sum SumSIMD; typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD; typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;

View File

@ -44,7 +44,7 @@ directory
#ifdef SSE4 #ifdef SSE4
#include "Grid_sse4.h" #include "Grid_sse4.h"
#endif #endif
#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) #if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
#include "Grid_avx.h" #include "Grid_avx.h"
#endif #endif
#if defined AVX512 #if defined AVX512

View File

@ -50,6 +50,12 @@ public:
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
std::string name(void) const { return std::string("Times"); } std::string name(void) const { return std::string("Times"); }
}; };
// Test functor for division: stores i1/i2 into rr for whatever type the
// caller instantiates it with, and reports its own label via name().
class funcDivide {
public:
  funcDivide() {}

  // Writes the quotient of i1 and i2 into rr.
  template<class vec>
  void operator()(vec &rr, vec &i1, vec &i2) const {
    rr = i1 / i2;
  }

  // Label for this operation.
  std::string name(void) const {
    return "Divide";
  }
};
class funcConj { class funcConj {
public: public:
funcConj() {}; funcConj() {};
@ -341,6 +347,7 @@ int main (int argc, char ** argv)
Tester<RealF,vRealF>(funcPlus()); Tester<RealF,vRealF>(funcPlus());
Tester<RealF,vRealF>(funcMinus()); Tester<RealF,vRealF>(funcMinus());
Tester<RealF,vRealF>(funcTimes()); Tester<RealF,vRealF>(funcTimes());
Tester<RealF,vRealF>(funcDivide());
Tester<RealF,vRealF>(funcAdj()); Tester<RealF,vRealF>(funcAdj());
Tester<RealF,vRealF>(funcConj()); Tester<RealF,vRealF>(funcConj());
Tester<RealF,vRealF>(funcInnerProduct()); Tester<RealF,vRealF>(funcInnerProduct());
@ -371,6 +378,7 @@ int main (int argc, char ** argv)
Tester<RealD,vRealD>(funcPlus()); Tester<RealD,vRealD>(funcPlus());
Tester<RealD,vRealD>(funcMinus()); Tester<RealD,vRealD>(funcMinus());
Tester<RealD,vRealD>(funcTimes()); Tester<RealD,vRealD>(funcTimes());
Tester<RealD,vRealD>(funcDivide());
Tester<RealD,vRealD>(funcAdj()); Tester<RealD,vRealD>(funcAdj());
Tester<RealD,vRealD>(funcConj()); Tester<RealD,vRealD>(funcConj());
Tester<RealD,vRealD>(funcInnerProduct()); Tester<RealD,vRealD>(funcInnerProduct());

View File

@ -68,7 +68,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu); LatticeCoordinate(coor,mu);
C = C - (TwoPiL * p[mu]) * coor; C = C + (TwoPiL * p[mu]) * coor;
} }
C = exp(C*ci); C = exp(C*ci);
@ -78,10 +78,11 @@ int main (int argc, char ** argv)
FFT theFFT(&Fine); FFT theFFT(&Fine);
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; Ctilde = C;
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
// C=zero; // C=zero;
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde); // Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@ -93,10 +94,11 @@ int main (int argc, char ** argv)
C=C-Ctilde; C=C-Ctilde;
std::cout << "diff scalar "<<norm2(C) << std::endl; std::cout << "diff scalar "<<norm2(C) << std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; Stilde = S;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
SpinMatrixF Sp; SpinMatrixF Sp;
Sp = zero; Sp = Sp+cVol; Sp = zero; Sp = Sp+cVol;