diff --git a/README.md b/README.md index bfe558a2..c47a257c 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | -For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). +For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. ### Possible SIMD types diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc new file mode 100644 index 00000000..dfaea627 --- /dev/null +++ b/benchmarks/Benchmark_mooee.cc @@ -0,0 +1,183 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt4 = GridDefaultLatt(); + const int Ls=16; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making Vec5d innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Seeded"<_Nprocessors; + + + if (1) + { + const int ncall=1000; + + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); + + double t0,t1; + + LatticeFermion r_eo(sFGrid); + LatticeFermion src_e (sFrbGrid); + LatticeFermion src_o (sFrbGrid); + LatticeFermion r_e (sFrbGrid); + LatticeFermion r_o (sFrbGrid); + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + setCheckerboard(r_eo,src_o); + setCheckerboard(r_eo,src_e); + + r_e = zero; + r_o = zero; + + BENCH_DW_MEO(Dhop ,src,result); + BENCH_DW_MEO(DhopEO ,src_o,r_e); + BENCH_DW(Meooe ,src_o,r_e); + BENCH_DW(Mooee ,src_o,r_o); + BENCH_DW(MooeeInv,src_o,r_o); + + } + + Grid_finalize(); +} diff --git a/configure.ac b/configure.ac index a6658a96..4225bac5 100644 --- a/configure.ac +++ b/configure.ac @@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -xavx';; AVXFMA) - AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma';; + AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) + SIMD_FLAGS='-mavx -fma';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; @@ -290,7 +290,7 @@ esac case ${ac_COMMS} in *-auto) LX_FIND_MPI - if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi + if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS" AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS" AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS" diff --git a/lib/FFT.h b/lib/FFT.h index b5b31d82..240f338b 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -244,7 +244,10 @@ namespace Grid { pokeLocalSite(s,pgbuf,cbuf); } } - result = Cshift(result,dim,L); + if (p != processors[dim] - 1) + { + result = Cshift(result,dim,L); + } } // Loop over orthog coords @@ -287,10 +290,10 @@ namespace Grid { cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; peekLocalSite(s,pgbuf,cgbuf); - s = s * div; pokeLocalSite(s,result,clbuf); } } + result = result*div; // destroying plan FFTW::fftw_destroy_plan(p); diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index ba6e577d..4a30f8c3 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -195,6 +195,7 @@ typedef WilsonTMFermion WilsonTMFermionD; typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; @@ -203,6 +204,20 @@ typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +// Ls vectorised +typedef DomainWallFermion DomainWallFermionVec5dR; +typedef DomainWallFermion DomainWallFermionVec5dF; +typedef DomainWallFermion DomainWallFermionVec5dD; + +typedef MobiusFermion MobiusFermionVec5dR; +typedef MobiusFermion MobiusFermionVec5dF; +typedef MobiusFermion MobiusFermionVec5dD; + +typedef ZMobiusFermion ZMobiusFermionVec5dR; +typedef ZMobiusFermion ZMobiusFermionVec5dF; +typedef ZMobiusFermion ZMobiusFermionVec5dD; + + typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; @@ -254,6 +269,7 @@ typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; + }} /////////////////////////////////////////////////////////////////////////////// // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 57b047d4..b8e98dce 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -62,6 +62,50 @@ void CayleyFermion5D::Dminus(const FermionField &psi, FermionField &chi) axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi } } + + +template void CayleyFermion5D::CayleyReport(void) +{ + this->Report(); + std::vector latt = GridDefaultLatt(); + RealD volume = this->Ls; for(int mu=0;mu_FourDimGrid->_Nprocessors; + if ( M5Dcalls > 0 ) { + std::cout << GridLogMessage << "#### M5D calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl; + + // Flops = 6.0*(Nc*Ns) *Ls*vol + RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + + if ( MooeeInvCalls > 0 ) { + + std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; + + // Flops = 9*12*Ls*vol/2 + RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + +} +template void CayleyFermion5D::CayleyZeroCounters(void) +{ + this->ZeroCounters(); + M5Dflops=0; + M5Dcalls=0; + M5Dtime=0; + MooeeInvFlops=0; + MooeeInvCalls=0; + MooeeInvTime=0; +} + + template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 1d8c2b95..6fb58234 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -120,6 +120,18 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); + + + void CayleyReport(void); + void CayleyZeroCounters(void); + + double M5Dflops; + double M5Dcalls; + double M5Dtime; + + double MooeeInvFlops; + double MooeeInvCalls; + double MooeeInvTime; protected: void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 62e91dd4..8e7df945 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -51,6 +51,9 @@ void CayleyFermion5D::M5D(const FermionField &psi, GridBase *grid=psi._grid; assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls for(int s=0;s @@ -91,6 +95,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; @@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP } } } + M5Dtime+=usecond(); } template @@ -126,10 +134,14 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); + PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; + // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // Apply (L^{\prime})^{-1} chi[ss]=psi[ss]; // chi[0]=psi[0] for(int s=1;s @@ -166,6 +181,8 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & assert(psi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls @@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP chi[ss+s] = chi[ss+s] - lee[s]*tmp; } } + + MooeeInvTime+=usecond(); + } #ifdef CAYLEY_DPERP_CACHE diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index f6569923..35a10de2 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -60,7 +60,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, GridBase *grid=psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; - int nsimd= Simd::Nsimd(); + const int nsimd= Simd::Nsimd(); Vector > u(LLs); Vector > l(LLs); @@ -86,35 +86,138 @@ void CayleyFermion5D::M5D(const FermionField &psi, d_p[ss] = diag[s]; }} + + M5Dcalls++; + M5Dtime-=usecond(); + + assert(Nc==3); + PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs +#if 0 + alignas(64) SiteHalfSpinor hp; + alignas(64) SiteHalfSpinor hm; + alignas(64) SiteSpinor fp; + alignas(64) SiteSpinor fm; - alignas(64) SiteHalfSpinor hp; - alignas(64) SiteHalfSpinor hm; - alignas(64) SiteSpinor fp; - alignas(64) SiteSpinor fm; + for(int v=0;v=v ) rotate(hm,hm,nsimd-1); + if ( vp<=v ) rotate(hp,hp,1); + if ( vm>=v ) rotate(hm,hm,nsimd-1); + + hp=0.5*hp; + hm=0.5*hm; - hp=hp*0.5; - hm=hm*0.5; - spRecon5m(fp,hp); - spRecon5p(fm,hm); + spRecon5m(fp,hp); + spRecon5p(fm,hm); - chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; - chi[ss+v] = chi[ss+v] +l[v]*fm; + chi[ss+v] = d[v]*phi[ss+v]; + chi[ss+v] = chi[ss+v] +u[v]*fp; + chi[ss+v] = chi[ss+v] +l[v]*fm; - } + } +#else + for(int v=0;v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + if ( vm>=v ) { + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + /* + if ( ss==0) std::cout << " dphi_00 " < @@ -154,6 +257,8 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs @@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP } } + M5Dtime+=usecond(); } - template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { @@ -250,13 +355,11 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField } } + MooeeInvCalls++; + MooeeInvTime-=usecond(); // Dynamic allocate on stack to get per thread without serialised heap acces -PARALLEL_FOR_LOOP - for(auto site=0;site SitePplus(LLs); Vector SitePminus(LLs); @@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; +#pragma omp for + for(auto site=0;site::Report(void) std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + + } if ( DerivCalls > 0 ) { @@ -209,12 +214,15 @@ void WilsonFermion5D::Report(void) RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; - } + + RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; } if (DerivCalls > 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<i ; 0xC unchanged */ -#if defined (AVX1) +#if defined (AVX1) __m256d ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -282,7 +282,7 @@ namespace Optimization { a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -323,7 +323,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_ps(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_ps( b, c, a); #endif } @@ -335,7 +335,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_pd(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_pd( b, c, a); #endif } @@ -350,7 +350,7 @@ namespace Optimization { } // Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) +#if defined (AVX1) || defined (AVXFMA) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 38785138..d6531d57 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -86,13 +86,13 @@ namespace Optimization { struct Vstream{ //Float inline void operator()(float * a, __m512 b){ - //_mm512_stream_ps(a,b); - _mm512_store_ps(a,b); + _mm512_stream_ps(a,b); + // _mm512_store_ps(a,b); } //Double inline void operator()(double * a, __m512d b){ - //_mm512_stream_pd(a,b); - _mm512_store_pd(a,b); + _mm512_stream_pd(a,b); + // _mm512_store_pd(a,b); } }; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 07933f52..bc86291d 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -244,7 +244,22 @@ namespace Optimization { return a*b; } }; - + + struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdiv(a, b); + } + + // Real float + FLOAT_WRAP_2(operator(), inline) + + // Integer + inline int operator()(int a, int b){ + return a/b; + } + }; + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -413,6 +428,7 @@ template using ReduceSIMD = Optimization::Reduce; typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index a0c1f55b..3bbc7b18 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -44,7 +44,7 @@ directory #ifdef SSE4 #include "Grid_sse4.h" #endif -#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) #include "Grid_avx.h" #endif #if defined AVX512 @@ -130,7 +130,7 @@ class Grid_simd { Vector_type v; - static inline int Nsimd(void) { + static inline constexpr int Nsimd(void) { return sizeof(Vector_type) / sizeof(Scalar_type); } diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 189f0559..92f9bcd8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -50,6 +50,12 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} std::string name(void) const { return std::string("Times"); } }; +class funcDivide { +public: + funcDivide() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} + std::string name(void) const { return std::string("Divide"); } +}; class funcConj { public: funcConj() {}; @@ -341,6 +347,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); @@ -371,6 +378,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 4eb4398d..22838f7b 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - (TwoPiL * p[mu]) * coor; + C = C + (TwoPiL * p[mu]) * coor; } C = exp(C*ci); @@ -78,10 +78,11 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<