diff --git a/.gitignore b/.gitignore index 38e3da2d..da7de5e4 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ config.status .deps Make.inc eigen.inc +Eigen.inc # http://www.gnu.org/software/autoconf # ######################################## diff --git a/README.md b/README.md index bfe558a2..c47a257c 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | -For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). +For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. ### Possible SIMD types diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc new file mode 100644 index 00000000..c895109f --- /dev/null +++ b/benchmarks/Benchmark_mooee.cc @@ -0,0 +1,237 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt4 = GridDefaultLatt(); + const int Ls=8; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making Vec5d innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Seeded"<_Nprocessors; + + + if (1) + { + const int ncall=100; + + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<::fftw_destroy_plan(p); diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index 91bfcc8b..f7471cf0 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -203,6 +203,7 @@ typedef WilsonTMFermion WilsonTMFermionD; typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; @@ -211,6 +212,20 @@ typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +// Ls vectorised +typedef DomainWallFermion DomainWallFermionVec5dR; +typedef DomainWallFermion DomainWallFermionVec5dF; +typedef DomainWallFermion DomainWallFermionVec5dD; + +typedef MobiusFermion MobiusFermionVec5dR; +typedef MobiusFermion MobiusFermionVec5dF; +typedef MobiusFermion MobiusFermionVec5dD; + +typedef ZMobiusFermion ZMobiusFermionVec5dR; +typedef ZMobiusFermion ZMobiusFermionVec5dF; +typedef ZMobiusFermion ZMobiusFermionVec5dD; + + typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; @@ -269,6 +284,7 @@ typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; + }} /////////////////////////////////////////////////////////////////////////////// // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 4c2d24bf..d2ac96e3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -194,6 +194,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + + } if ( DerivCalls > 0 ) { @@ -209,12 +214,15 @@ void WilsonFermion5D::Report(void) RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; - } + + RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; } if (DerivCalls > 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<i ; 0xC unchanged */ -#if defined (AVX1) +#if defined (AVX1) __m256d ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -279,7 +279,7 @@ namespace Optimization { a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -320,7 +320,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_ps(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_ps( b, c, a); #endif } @@ -332,7 +332,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_pd(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_pd( b, c, a); #endif } @@ -347,7 +347,7 @@ namespace Optimization { } // Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) +#if defined (AVX1) || defined (AVXFMA) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 07933f52..bc86291d 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -244,7 +244,22 @@ namespace Optimization { return a*b; } }; - + + struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdiv(a, b); + } + + // Real float + FLOAT_WRAP_2(operator(), inline) + + // Integer + inline int operator()(int a, int b){ + return a/b; + } + }; + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -413,6 +428,7 @@ template using ReduceSIMD = Optimization::Reduce; typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 184baad9..080dd5c0 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -44,7 +44,7 @@ directory #ifdef SSE4 #include "Grid_sse4.h" #endif -#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) #include "Grid_avx.h" #endif #if defined AVX512 diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 189f0559..92f9bcd8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -50,6 +50,12 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} std::string name(void) const { return std::string("Times"); } }; +class funcDivide { +public: + funcDivide() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} + std::string name(void) const { return std::string("Divide"); } +}; class funcConj { public: funcConj() {}; @@ -341,6 +347,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); @@ -371,6 +378,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 4eb4398d..22838f7b 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - (TwoPiL * p[mu]) * coor; + C = C + (TwoPiL * p[mu]) * coor; } C = exp(C*ci); @@ -78,10 +78,11 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<