From a2cffb030442cca679038f4655348df4260ab5de Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 21 Nov 2016 17:47:18 +0100 Subject: [PATCH 01/10] AVXFMA target fixed --- configure.ac | 4 ++-- lib/simd/Grid_avx.h | 18 +++++++++--------- lib/simd/Grid_vector_types.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/configure.ac b/configure.ac index a6658a96..90764cb7 100644 --- a/configure.ac +++ b/configure.ac @@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -xavx';; AVXFMA) - AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma';; + AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) + SIMD_FLAGS='-mavx -fma';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index f50eae2b..36360102 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -167,7 +167,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -195,7 +195,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -216,7 +216,7 @@ namespace Optimization { struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ -#if defined (AVX1) +#if defined (AVX1) __m256 ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br @@ -233,7 +233,7 @@ namespace Optimization { a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -264,7 +264,7 @@ namespace Optimization { IF IMM0[3] = 0 THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged */ -#if defined (AVX1) +#if defined (AVX1) __m256d ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -279,7 +279,7 @@ namespace Optimization { a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -320,7 +320,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_ps(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_ps( b, c, a); #endif } @@ -332,7 +332,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_pd(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_pd( b, c, a); #endif } @@ -347,7 +347,7 @@ namespace Optimization { } // Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) +#if defined (AVX1) || defined (AVXFMA) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 184baad9..080dd5c0 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -44,7 +44,7 @@ directory #ifdef SSE4 #include "Grid_sse4.h" #endif -#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) #include "Grid_avx.h" #endif #if defined AVX512 From 5833f247fa7a8f85b6afd5397796455070182792 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 24 Nov 2016 09:09:48 +0900 Subject: [PATCH 02/10] more FFt optimisations --- lib/FFT.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index b5b31d82..240f338b 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -244,7 +244,10 @@ namespace Grid { pokeLocalSite(s,pgbuf,cbuf); } } - result = Cshift(result,dim,L); + if (p != processors[dim] - 1) + { + result = Cshift(result,dim,L); + } } // Loop over orthog coords @@ -287,10 +290,10 @@ namespace Grid { cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; peekLocalSite(s,pgbuf,cgbuf); - s = s * div; pokeLocalSite(s,result,clbuf); } } + result = result*div; // destroying plan FFTW::fftw_destroy_plan(p); From 3cdf945d840dfb47c8a939bc113425ee3b8ebe68 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 24 Nov 2016 09:10:03 +0900 Subject: [PATCH 03/10] Test_fftf fix --- tests/core/Test_fftf.cc | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 4eb4398d..22838f7b 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - (TwoPiL * p[mu]) * coor; + C = C + (TwoPiL * p[mu]) * coor; } C = exp(C*ci); @@ -78,10 +78,11 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()< Date: Thu, 24 Nov 2016 13:24:12 +0000 Subject: [PATCH 04/10] Add QPX Div structure --- lib/simd/Grid_qpx.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 07933f52..1cd6edae 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -245,6 +245,21 @@ namespace Optimization { } }; + struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdivs(a, b); + } + + // Real float + FLOAT_WRAP_2(operator(), inline) + + // Integer + inline int operator()(int a, int b){ + return a/b; + } + } + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -412,6 +427,7 @@ template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; From b18950f776954213ecda0b4785aec4cb67833680 Mon Sep 17 00:00:00 2001 From: Lanny91 Date: Fri, 25 Nov 2016 13:21:33 +0000 Subject: [PATCH 05/10] Added simd real divide test with QPX divide fixes --- lib/simd/Grid_qpx.h | 14 +++++++------- tests/Test_simd.cc | 8 ++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 1cd6edae..bc86291d 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -244,22 +244,22 @@ namespace Optimization { return a*b; } }; - + struct Div{ // Real double inline vector4double operator()(vector4double a, vector4double b){ - return vec_swdivs(a, b); + return vec_swdiv(a, b); } - + // Real float FLOAT_WRAP_2(operator(), inline) - + // Integer inline int operator()(int a, int b){ return a/b; } - } - + }; + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -427,8 +427,8 @@ template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; -typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 189f0559..92f9bcd8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -50,6 +50,12 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} std::string name(void) const { return std::string("Times"); } }; +class funcDivide { +public: + funcDivide() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} + std::string name(void) const { return std::string("Divide"); } +}; class funcConj { public: funcConj() {}; @@ -341,6 +347,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); @@ -371,6 +378,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); From 6c0cc5676bc7969f0d82b73146c9c5f9021c135b Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sat, 26 Nov 2016 18:25:12 +0000 Subject: [PATCH 06/10] Adding Eigen.inc to the gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 38e3da2d..da7de5e4 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ config.status .deps Make.inc eigen.inc +Eigen.inc # http://www.gnu.org/software/autoconf # ######################################## From 1e44fd309402c1ae099be6d5371100e7a3dd4c75 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sat, 26 Nov 2016 18:30:53 +0000 Subject: [PATCH 07/10] Added some details on the mpi flags for Cray machines --- README.md | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfe558a2..c47a257c 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | -For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). +For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. ### Possible SIMD types diff --git a/configure.ac b/configure.ac index 90764cb7..4225bac5 100644 --- a/configure.ac +++ b/configure.ac @@ -290,7 +290,7 @@ esac case ${ac_COMMS} in *-auto) LX_FIND_MPI - if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi + if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS" AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS" AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS" From ae9688e343acef3e33776a702faf98b25d78cfaa Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 28 Nov 2016 11:37:02 +0000 Subject: [PATCH 08/10] Reporting also the total mflops --- lib/qcd/action/fermion/WilsonFermion5D.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 4c2d24bf..d2ac96e3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -194,6 +194,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + + } if ( DerivCalls > 0 ) { @@ -209,12 +214,15 @@ void WilsonFermion5D::Report(void) RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; - } + + RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; } if (DerivCalls > 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"< Date: Tue, 29 Nov 2016 00:14:36 +0000 Subject: [PATCH 09/10] Actions updated --- lib/qcd/action/Actions.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index ba6e577d..4a30f8c3 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -195,6 +195,7 @@ typedef WilsonTMFermion WilsonTMFermionD; typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; @@ -203,6 +204,20 @@ typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +// Ls vectorised +typedef DomainWallFermion DomainWallFermionVec5dR; +typedef DomainWallFermion DomainWallFermionVec5dF; +typedef DomainWallFermion DomainWallFermionVec5dD; + +typedef MobiusFermion MobiusFermionVec5dR; +typedef MobiusFermion MobiusFermionVec5dF; +typedef MobiusFermion MobiusFermionVec5dD; + +typedef ZMobiusFermion ZMobiusFermionVec5dR; +typedef ZMobiusFermion ZMobiusFermionVec5dF; +typedef ZMobiusFermion ZMobiusFermionVec5dD; + + typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; @@ -254,6 +269,7 @@ typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; + }} /////////////////////////////////////////////////////////////////////////////// // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code From 2f92b4860b6e653266fd49bdabcf2690bffbc39b Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 29 Nov 2016 00:15:08 +0000 Subject: [PATCH 10/10] Test the full Mooee sector --- benchmarks/Benchmark_mooee.cc | 237 ++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 benchmarks/Benchmark_mooee.cc diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc new file mode 100644 index 00000000..c895109f --- /dev/null +++ b/benchmarks/Benchmark_mooee.cc @@ -0,0 +1,237 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt4 = GridDefaultLatt(); + const int Ls=8; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making Vec5d innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Seeded"<_Nprocessors; + + + if (1) + { + const int ncall=100; + + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<