From a2cffb030442cca679038f4655348df4260ab5de Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 21 Nov 2016 17:47:18 +0100 Subject: [PATCH 01/14] AVXFMA target fixed --- configure.ac | 4 ++-- lib/simd/Grid_avx.h | 18 +++++++++--------- lib/simd/Grid_vector_types.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/configure.ac b/configure.ac index a6658a96..90764cb7 100644 --- a/configure.ac +++ b/configure.ac @@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -xavx';; AVXFMA) - AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma';; + AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) + SIMD_FLAGS='-mavx -fma';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index f50eae2b..36360102 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -167,7 +167,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -195,7 +195,7 @@ namespace Optimization { } //Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) || defined (AVXFMA4) +#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); @@ -216,7 +216,7 @@ namespace Optimization { struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ -#if defined (AVX1) +#if defined (AVX1) __m256 ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br @@ -233,7 +233,7 @@ namespace Optimization { a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -264,7 +264,7 @@ namespace Optimization { IF IMM0[3] = 0 THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged */ -#if defined (AVX1) +#if defined (AVX1) __m256d ymm0,ymm1,ymm2; ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -279,7 +279,7 @@ namespace Optimization { a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br @@ -320,7 +320,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_ps(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_ps( b, c, a); #endif } @@ -332,7 +332,7 @@ namespace Optimization { #if defined (AVXFMA4) a= _mm256_macc_pd(b,c,a); #endif -#if defined (AVX2) +#if defined (AVX2) || defined (AVXFMA) a= _mm256_fmadd_pd( b, c, a); #endif } @@ -347,7 +347,7 @@ namespace Optimization { } // Integer inline __m256i operator()(__m256i a, __m256i b){ -#if defined (AVX1) +#if defined (AVX1) || defined (AVXFMA) __m128i a0,a1; __m128i b0,b1; a0 = _mm256_extractf128_si256(a,0); diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 184baad9..080dd5c0 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -44,7 +44,7 @@ directory #ifdef SSE4 #include "Grid_sse4.h" #endif -#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) #include "Grid_avx.h" #endif #if defined AVX512 From 5833f247fa7a8f85b6afd5397796455070182792 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 24 Nov 2016 09:09:48 +0900 Subject: [PATCH 02/14] more FFt optimisations --- lib/FFT.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index b5b31d82..240f338b 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -244,7 +244,10 @@ namespace Grid { pokeLocalSite(s,pgbuf,cbuf); } } - result = Cshift(result,dim,L); + if (p != processors[dim] - 1) + { + result = Cshift(result,dim,L); + } } // Loop over orthog coords @@ -287,10 +290,10 @@ namespace Grid { cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; peekLocalSite(s,pgbuf,cgbuf); - s = s * div; pokeLocalSite(s,result,clbuf); } } + result = result*div; // destroying plan FFTW::fftw_destroy_plan(p); From 3cdf945d840dfb47c8a939bc113425ee3b8ebe68 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 24 Nov 2016 09:10:03 +0900 Subject: [PATCH 03/14] Test_fftf fix --- tests/core/Test_fftf.cc | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 4eb4398d..22838f7b 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - (TwoPiL * p[mu]) * coor; + C = C + (TwoPiL * p[mu]) * coor; } C = exp(C*ci); @@ -78,10 +78,11 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()< Date: Thu, 24 Nov 2016 13:24:12 +0000 Subject: [PATCH 04/14] Add QPX Div structure --- lib/simd/Grid_qpx.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 07933f52..1cd6edae 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -245,6 +245,21 @@ namespace Optimization { } }; + struct Div{ + // Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_swdivs(a, b); + } + + // Real float + FLOAT_WRAP_2(operator(), inline) + + // Integer + inline int operator()(int a, int b){ + return a/b; + } + } + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -412,6 +427,7 @@ template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; From b18950f776954213ecda0b4785aec4cb67833680 Mon Sep 17 00:00:00 2001 From: Lanny91 Date: Fri, 25 Nov 2016 13:21:33 +0000 Subject: [PATCH 05/14] Added simd real divide test with QPX divide fixes --- lib/simd/Grid_qpx.h | 14 +++++++------- tests/Test_simd.cc | 8 ++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 1cd6edae..bc86291d 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -244,22 +244,22 @@ namespace Optimization { return a*b; } }; - + struct Div{ // Real double inline vector4double operator()(vector4double a, vector4double b){ - return vec_swdivs(a, b); + return vec_swdiv(a, b); } - + // Real float FLOAT_WRAP_2(operator(), inline) - + // Integer inline int operator()(int a, int b){ return a/b; } - } - + }; + struct Conj{ // Complex double inline vector4double operator()(vector4double v){ @@ -427,8 +427,8 @@ template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; -typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; +typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 189f0559..92f9bcd8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -50,6 +50,12 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} std::string name(void) const { return std::string("Times"); } }; +class funcDivide { +public: + funcDivide() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} + std::string name(void) const { return std::string("Divide"); } +}; class funcConj { public: funcConj() {}; @@ -341,6 +347,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); @@ -371,6 +378,7 @@ int main (int argc, char ** argv) Tester(funcPlus()); Tester(funcMinus()); Tester(funcTimes()); + Tester(funcDivide()); Tester(funcAdj()); Tester(funcConj()); Tester(funcInnerProduct()); From 6c0cc5676bc7969f0d82b73146c9c5f9021c135b Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sat, 26 Nov 2016 18:25:12 +0000 Subject: [PATCH 06/14] Adding Eigen.inc to the gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 38e3da2d..da7de5e4 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ config.status .deps Make.inc eigen.inc +Eigen.inc # http://www.gnu.org/software/autoconf # ######################################## From 1e44fd309402c1ae099be6d5371100e7a3dd4c75 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sat, 26 Nov 2016 18:30:53 +0000 Subject: [PATCH 07/14] Added some details on the mpi flags for Cray machines --- README.md | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfe558a2..c47a257c 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | -For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). +For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. ### Possible SIMD types diff --git a/configure.ac b/configure.ac index 90764cb7..4225bac5 100644 --- a/configure.ac +++ b/configure.ac @@ -290,7 +290,7 @@ esac case ${ac_COMMS} in *-auto) LX_FIND_MPI - if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi + if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS" AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS" AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS" From ae9688e343acef3e33776a702faf98b25d78cfaa Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 28 Nov 2016 11:37:02 +0000 Subject: [PATCH 08/14] Reporting also the total mflops --- lib/qcd/action/fermion/WilsonFermion5D.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 4c2d24bf..d2ac96e3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -194,6 +194,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + + } if ( DerivCalls > 0 ) { @@ -209,12 +214,15 @@ void WilsonFermion5D::Report(void) RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; - } + + RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; } if (DerivCalls > 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"< Date: Tue, 29 Nov 2016 00:14:36 +0000 Subject: [PATCH 09/14] Actions updated --- lib/qcd/action/Actions.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index ba6e577d..4a30f8c3 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -195,6 +195,7 @@ typedef WilsonTMFermion WilsonTMFermionD; typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; @@ -203,6 +204,20 @@ typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +// Ls vectorised +typedef DomainWallFermion DomainWallFermionVec5dR; +typedef DomainWallFermion DomainWallFermionVec5dF; +typedef DomainWallFermion DomainWallFermionVec5dD; + +typedef MobiusFermion MobiusFermionVec5dR; +typedef MobiusFermion MobiusFermionVec5dF; +typedef MobiusFermion MobiusFermionVec5dD; + +typedef ZMobiusFermion ZMobiusFermionVec5dR; +typedef ZMobiusFermion ZMobiusFermionVec5dF; +typedef ZMobiusFermion ZMobiusFermionVec5dD; + + typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; @@ -254,6 +269,7 @@ typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; + }} /////////////////////////////////////////////////////////////////////////////// // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code From 2f92b4860b6e653266fd49bdabcf2690bffbc39b Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 29 Nov 2016 00:15:08 +0000 Subject: [PATCH 10/14] Test the full Mooee sector --- benchmarks/Benchmark_mooee.cc | 237 ++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 benchmarks/Benchmark_mooee.cc diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc new file mode 100644 index 00000000..c895109f --- /dev/null +++ b/benchmarks/Benchmark_mooee.cc @@ -0,0 +1,237 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt4 = GridDefaultLatt(); + const int Ls=8; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making Vec5d innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Seeded"<_Nprocessors; + + + if (1) + { + const int ncall=100; + + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + + double t0,t1; + t0=usecond(); + for(int i=0;iBarrier(); + + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Mooee(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.MooeeInv(src_o, r_o); + } + t1=usecond(); + FGrid->Barrier(); + std::cout<Barrier(); + t0=usecond(); + for (int i = 0; i < ncall; i++) { + Dw.Meooe(src_o, r_e); + } + t1=usecond(); + FGrid->Barrier(); + std::cout< Date: Tue, 29 Nov 2016 22:27:55 +0000 Subject: [PATCH 11/14] Serialisation in malloc fixed --- benchmarks/Benchmark_mooee.cc | 128 +++++------------- lib/qcd/action/fermion/CayleyFermion5D.cc | 44 ++++++ lib/qcd/action/fermion/CayleyFermion5D.h | 12 ++ .../action/fermion/CayleyFermion5Dcache.cc | 20 +++ lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 22 ++- 5 files changed, 128 insertions(+), 98 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index c895109f..097e6da3 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) if (1) { - const int ncall=100; + const int ncall=1000; std::cout << GridLogMessage<< "*********************************************************" <Barrier(); - double t0,t1; - t0=usecond(); - for(int i=0;iBarrier(); - - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.DhopEO(src_o, r_e, DaggerNo); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Mooee(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.MooeeInv(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Meooe(src_o, r_e); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); double t0,t1; - t0=usecond(); - for(int i=0;iBarrier(); - - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.DhopEO(src_o, r_e, DaggerNo); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Mooee(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.MooeeInv(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Meooe(src_o, r_e); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<::Dminus(const FermionField &psi, FermionField &chi) axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi } } + + +template void CayleyFermion5D::CayleyReport(void) +{ + this->Report(); + std::vector latt = GridDefaultLatt(); + RealD volume = this->Ls; for(int mu=0;mu_FourDimGrid->_Nprocessors; + if ( M5Dcalls > 0 ) { + std::cout << GridLogMessage << "#### M5D calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl; + + // Flops = 6.0*(Nc*Ns) *Ls*vol + RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + + if ( MooeeInvCalls > 0 ) { + + std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; + + // Flops = 9*12*Ls*vol/2 + RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + +} +template void CayleyFermion5D::CayleyZeroCounters(void) +{ + this->ZeroCounters(); + M5Dflops=0; + M5Dcalls=0; + M5Dtime=0; + MooeeInvFlops=0; + MooeeInvCalls=0; + MooeeInvTime=0; +} + + template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 1d8c2b95..6fb58234 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -120,6 +120,18 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); + + + void CayleyReport(void); + void CayleyZeroCounters(void); + + double M5Dflops; + double M5Dcalls; + double M5Dtime; + + double MooeeInvFlops; + double MooeeInvCalls; + double MooeeInvTime; protected: void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 62e91dd4..8e7df945 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -51,6 +51,9 @@ void CayleyFermion5D::M5D(const FermionField &psi, GridBase *grid=psi._grid; assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls for(int s=0;s @@ -91,6 +95,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; @@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP } } } + M5Dtime+=usecond(); } template @@ -126,10 +134,14 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); + PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; + // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // Apply (L^{\prime})^{-1} chi[ss]=psi[ss]; // chi[0]=psi[0] for(int s=1;s @@ -166,6 +181,8 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & assert(psi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls @@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP chi[ss+s] = chi[ss+s] - lee[s]*tmp; } } + + MooeeInvTime+=usecond(); + } #ifdef CAYLEY_DPERP_CACHE diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index f6569923..3f3f215c 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -71,6 +71,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, chi.checkerboard=psi.checkerboard; + // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type * u_p = (scalar_type *)&u[0]; @@ -86,6 +87,8 @@ void CayleyFermion5D::M5D(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs @@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP } } + M5Dtime+=usecond(); } template @@ -154,6 +158,8 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs @@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP } } + M5Dtime+=usecond(); } template @@ -250,13 +257,11 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField } } + MooeeInvCalls++; + MooeeInvTime-=usecond(); // Dynamic allocate on stack to get per thread without serialised heap acces -PARALLEL_FOR_LOOP - for(auto site=0;site SitePplus(LLs); Vector SitePminus(LLs); @@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; +#pragma omp for + for(auto site=0;site Date: Wed, 30 Nov 2016 22:11:10 +0000 Subject: [PATCH 12/14] Ls 16 more relevant --- benchmarks/Benchmark_mooee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index 097e6da3..df80288c 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -41,7 +41,7 @@ int main (int argc, char ** argv) std::cout< latt4 = GridDefaultLatt(); - const int Ls=8; + const int Ls=16; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); From 6adf35da54fb6df2d71980447c8d6271ab34565b Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 1 Dec 2016 11:39:04 +0000 Subject: [PATCH 13/14] Faster Mobius --- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 137 ++++++++++++++++--- lib/simd/Grid_vector_types.h | 2 +- 2 files changed, 117 insertions(+), 22 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 3f3f215c..cfd96aaf 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -60,7 +60,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, GridBase *grid=psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; - int nsimd= Simd::Nsimd(); + const int nsimd= Simd::Nsimd(); Vector > u(LLs); Vector > l(LLs); @@ -71,7 +71,6 @@ void CayleyFermion5D::M5D(const FermionField &psi, chi.checkerboard=psi.checkerboard; - // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type * u_p = (scalar_type *)&u[0]; @@ -87,36 +86,133 @@ void CayleyFermion5D::M5D(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; M5Dtime-=usecond(); + + assert(Nc==3); + PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs +#if 0 + alignas(64) SiteHalfSpinor hp; + alignas(64) SiteHalfSpinor hm; + alignas(64) SiteSpinor fp; + alignas(64) SiteSpinor fm; - alignas(64) SiteHalfSpinor hp; - alignas(64) SiteHalfSpinor hm; - alignas(64) SiteSpinor fp; - alignas(64) SiteSpinor fm; + for(int v=0;v=v ) rotate(hm,hm,nsimd-1); + + hp=0.5*hp; + hm=0.5*hm; + + spRecon5m(fp,hp); + spRecon5p(fm,hm); + + chi[ss+v] = d[v]*phi[ss+v]; + chi[ss+v] = chi[ss+v] +u[v]*fp; + chi[ss+v] = chi[ss+v] +l[v]*fm; + + } +#else + for(int v=0;v=v ) rotate(hm,hm,nsimd-1); + int vp= (v==LLs-1) ? 0 : v+1; + int vm= (v==0 ) ? LLs-1 : v-1; + + Simd hp_00 = psi[ss+vp]()(2)(0); + Simd hp_01 = psi[ss+vp]()(2)(1); + Simd hp_02 = psi[ss+vp]()(2)(2); + Simd hp_10 = psi[ss+vp]()(3)(0); + Simd hp_11 = psi[ss+vp]()(3)(1); + Simd hp_12 = psi[ss+vp]()(3)(2); + + Simd hm_00 = psi[ss+vm]()(0)(0); + Simd hm_01 = psi[ss+vm]()(0)(1); + Simd hm_02 = psi[ss+vm]()(0)(2); + Simd hm_10 = psi[ss+vm]()(1)(0); + Simd hm_11 = psi[ss+vm]()(1)(1); + Simd hm_12 = psi[ss+vm]()(1)(2); - hp=hp*0.5; - hm=hm*0.5; - spRecon5m(fp,hp); - spRecon5p(fm,hm); + // if ( ss==0) std::cout << " hp_00 " <(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + if ( vm>=v ) { + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } - } + /* + if ( ss==0) std::cout << " dphi_00 " < void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 080dd5c0..42f28b34 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -130,7 +130,7 @@ class Grid_simd { Vector_type v; - static inline int Nsimd(void) { + static inline constexpr int Nsimd(void) { return sizeof(Vector_type) / sizeof(Scalar_type); } From e27c6b217c64edc343c054e77b6fa57b58963f93 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2016 12:42:53 +0000 Subject: [PATCH 14/14] Updating --- benchmarks/Benchmark_mooee.cc | 2 ++ lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 5 ++++- lib/simd/Grid_avx512.h | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index df80288c..dfaea627 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -101,6 +101,7 @@ int main (int argc, char ** argv) #define BENCH_DW(A,in,out) \ Dw.CayleyZeroCounters(); \ + Dw. A (in,out); \ FGrid->Barrier(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ t0=usecond(); \ for(int i=0;i