From 683550f1166d316f3af722228c8be4b6c87c8d16 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 9 Apr 2017 23:41:04 +0900 Subject: [PATCH 01/22] Const args improvement --- lib/lattice/Lattice_transfer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 8eb75f15..68de52d0 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -359,7 +359,7 @@ void localConvert(const Lattice &in,Lattice &out) template -void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int orthog) +void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -401,7 +401,7 @@ void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int } template -void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, int orthog) +void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -444,7 +444,7 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in template -void InsertSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { typedef typename vobj::scalar_object sobj; From db5f6d3ae39a77800faab9cf0e70f2f24898c1e9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 9 Apr 2017 23:41:30 +0900 Subject: [PATCH 02/22] Verbose fix --- lib/qcd/action/fermion/ImprovedStaggeredFermion.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 2ba4f4af..5ce2b335 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -160,8 +160,6 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - std::cout << " Umu " << Umu._odata[0]< Date: Sun, 9 Apr 2017 23:42:10 +0900 Subject: [PATCH 03/22] Start of blockCG --- .../iterative/BlockConjugateGradient.h | 337 ++++++++++++++++++ .../solver/Test_staggered_block_cg_unprec.cc | 94 +++++ tests/solver/Test_staggered_cg_unprec.cc | 83 +++++ 3 files changed, 514 insertions(+) create mode 100644 lib/algorithms/iterative/BlockConjugateGradient.h create mode 100644 tests/solver/Test_staggered_block_cg_unprec.cc create mode 100644 tests/solver/Test_staggered_cg_unprec.cc diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h new file mode 100644 index 00000000..616387d8 --- /dev/null +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -0,0 +1,337 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h + +Copyright (C) 2017 + +Author: Azusa Yamaguchi +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H +#define GRID_BLOCK_CONJUGATE_GRADIENT_H + +#include + +namespace Grid { + +GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); +} + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss +static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); + std::vector sip(Nblock); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + Lattice tmp = rhs; + + for(int s1=0;s1 +class BlockConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" Block solver has converged in " + < +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DR::ImplParams params; + + const int Ls=8; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + + FermionField src(FGrid); random(pRNG5,src); + FermionField result(FGrid); result=zero; + RealD nrm = norm2(src); + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + + RealD mass=0.01; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + + MdagMLinearOperator HermOp(Ds); + + ConjugateGradient CG(1.0e-8,10000); + BlockConjugateGradient BCG(1.0e-8,10000); + + std::cout << GridLogMessage << " Calling CG "< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + FermionField src(&Grid); random(pRNG,src); + RealD nrm = norm2(src); + FermionField result(&Grid); result=zero; + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOp(Ds); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOp,src,result); + + Grid_finalize(); +} From d80d802f9daafee01462579ca7eb4b9b8f452c6e Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 10 Apr 2017 00:12:12 +0900 Subject: [PATCH 04/22] MultiRHS solver test --- .../iterative/BlockConjugateGradient.h | 159 ++++++++++++++++++ .../solver/Test_staggered_block_cg_unprec.cc | 5 + 2 files changed, 164 insertions(+) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 616387d8..c333f8e7 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -81,6 +81,30 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice } }; template +static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; @@ -194,6 +218,8 @@ static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &Linop, const Field &Src, Field &Psi) IterationsToComplete = k; } }; + + +////////////////////////////////////////////////////////////////////////// +// multiRHS conjugate gradient. Dimension zero should be the block direction +////////////////////////////////////////////////////////////////////////// +template +class MultiRHSConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< v_pAp(Nblock); + std::vector v_rr (Nblock); + std::vector v_rr_inv(Nblock); + std::vector v_alpha(Nblock); + std::vector v_beta(Nblock); + + // Initial residual computation & set up + std::vector residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" MultiRHS solver has converged in " + < CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); + MultiRHSConjugateGradient mCG(1.0e-8,10000); std::cout << GridLogMessage << " Calling CG "< Date: Mon, 10 Apr 2017 20:38:20 +0900 Subject: [PATCH 05/22] Commenting and clean up --- .../iterative/BlockConjugateGradient.h | 20 +++++++++++++++++++ lib/algorithms/iterative/ConjugateGradient.h | 3 +-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index c333f8e7..0f4a3a80 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -69,8 +69,14 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow // If we based this on Cshift it would work for spread out // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. for(int i=0;i static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, int Orthog,RealD scale=1.0) { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -107,6 +116,8 @@ static void sliceMaddVector (Lattice &R,std::vector &a,const Lattic template static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { + // FIXME: Implementation is slow + // Not sure of best solution.. think about it typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -141,6 +152,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice template static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -257,6 +271,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); + GridStopWatch LinalgTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -339,8 +357,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" Block solver has converged in " < { } std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: k=0 residual " << cp << " target " << rsq - << std::endl; + << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; From cb6b81ae8291a3cecf2cc896a04842792735e67e Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 12 Apr 2017 19:32:37 +0100 Subject: [PATCH 06/22] Half precision conversion --- configure.ac | 10 ++--- lib/simd/Grid_avx.h | 37 ++++++++++++++++++ lib/simd/Grid_avx512.h | 36 +++++++++++++++++ lib/simd/Grid_sse4.h | 57 +++++++++++++++++++++++---- lib/simd/Grid_vector_types.h | 65 ++++++++++++++++++++++++++++-- tests/Test_simd.cc | 76 ++++++++++++++++++++++++++++++++++-- 6 files changed, 261 insertions(+), 20 deletions(-) diff --git a/configure.ac b/configure.ac index 3a323a5d..6924fae1 100644 --- a/configure.ac +++ b/configure.ac @@ -176,19 +176,19 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2';; + SIMD_FLAGS='-msse4.2 -mf16c';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) - SIMD_FLAGS='-mavx';; + SIMD_FLAGS='-mavx -mf16c';; AVXFMA4) AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma4';; + SIMD_FLAGS='-mavx -mfma4 -mf16c';; AVXFMA) AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) - SIMD_FLAGS='-mavx -mfma';; + SIMD_FLAGS='-mavx -mfma -mf16c';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) - SIMD_FLAGS='-mavx2 -mfma';; + SIMD_FLAGS='-mavx2 -mfma -mf16c';; AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 2dbe26f4..86422f51 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -471,6 +471,42 @@ namespace Optimization { }; }; + struct PrecisionChange { + static inline __m256i StoH (__m256 a,__m256 b) { + __m128i ha = _mm256_cvtps_ph(a,0); + __m128i hb = _mm256_cvtps_ph(b,0); + __m256 h = _mm256_castps128_ps256(ha); + h = _mm256_insertf128_ps(h,hb,1); + return h; + } + static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { + sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); + sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); + } + static inline __m256 DtoS (__m256d a,__m256d b) { + __m128 sa = _mm256_cvtpd_ps(a); + __m128 sb = _mm256_cvtpd_ps(b); + __m256 s = _mm256_castps128_ps256(sa); + s = _mm256_insertf128_ps(s,sb,1); + return s; + } + static inline void StoD (__m256 s,__m256d &a,__m256d &b) { + a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); + b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); + } + static inline __m256 DtoH (__m256i a,__m256 b,__m256 c,__m256 d) { + __m256 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { + __m256 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; struct Exchange{ // 3210 ordering static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ @@ -675,6 +711,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef __m256i SIMD_Htype; // Single precision type typedef __m256 SIMD_Ftype; // Single precision type typedef __m256d SIMD_Dtype; // Double precision type typedef __m256i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index f39c4033..647fba77 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -343,6 +343,42 @@ namespace Optimization { }; + struct PrecisionChange { + static inline __m512i StoH (__m512 a,__m512 b) { + __m256i ha = _mm512_cvtps_ph(a,0); + __m256i hb = _mm512_cvtps_ph(b,0); + __m512 h = _mm512_castps256_ps512(ha); + h = _mm512_insertf256_ps(h,hb,1); + return h; + } + static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { + sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0)); + sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1)); + } + static inline __m512 DtoS (__m512d a,__m512d b) { + __m256 sa = _mm512_cvtpd_ps(a); + __m256 sb = _mm512_cvtpd_ps(b); + __m512 s = _mm512_castps256_ps512(sa); + s = _mm512_insertf256_ps(s,sb,1); + return s; + } + static inline void StoD (__m512 s,__m512d &a,__m512d &b) { + a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0)); + b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1)); + } + static inline __m512 DtoH (__m512i a,__m512 b,__m512 c,__m512 d) { + __m512 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { + __m512 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl // The operation is its own inverse diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index fcad4c28..cf78aa52 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -38,6 +38,7 @@ Author: neo #include + namespace Grid { namespace Optimization { @@ -328,6 +329,48 @@ namespace Optimization { }; }; + +#ifndef _mm_alignr_epi64 +#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) +#endif + struct PrecisionChange { + static inline __m128i StoH (__m128 a,__m128 b) { + __m128i ha = _mm_cvtps_ph(a,0); + __m128i hb = _mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return h; + } + static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { + sa = _mm_cvtph_ps(h); + h = (__m128)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = _mm_cvtph_ps(h); + } + static inline __m128 DtoS (__m128d a,__m128d b) { + __m128 sa = _mm_cvtpd_ps(a); + __m128 sb = _mm_cvtpd_ps(b); + __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return s; + } + static inline void StoD (__m128 s,__m128d &a,__m128d &b) { + a = _mm_cvtps_pd(s); + s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); + b = _mm_cvtps_pd(s); + } + static inline __m128 DtoH (__m128i a,__m128 b,__m128 c,__m128 d) { + __m128 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { + __m128 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + struct Exchange{ // 3210 ordering static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ @@ -335,8 +378,10 @@ namespace Optimization { out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ }; static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ assert(0); @@ -383,11 +428,6 @@ namespace Optimization { default: assert(0); } } - -#ifndef _mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; @@ -450,7 +490,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m128 SIMD_Ftype; // Single precision type + typedef __m128i SIMD_Htype; // Single precision type + typedef __m128 SIMD_Ftype; // Single precision type typedef __m128d SIMD_Dtype; // Double precision type typedef __m128i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 57e7f11e..ccedf99c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -358,16 +358,12 @@ class Grid_simd { { if (n==3) { Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - // std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<, SIMD_Ftype> vComplexF; typedef Grid_simd, SIMD_Dtype> vComplexD; typedef Grid_simd vInteger; +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2({45,12,81,9})); + const int Ndp = 16; + const int Nsp = Ndp/2; + const int Nhp = Ndp/4; + std::vector > H (Nhp); + std::vector > F (Nsp); + std::vector > FF(Nsp); + std::vector > D (Ndp); + std::vector > DD(Ndp); + for(int i=0;i<16;i++){ + random(sRNG,D[i]); + } + // Double to Single + precisionChange(&F[0],&D[0],Ndp); + precisionChange(&DD[0],&F[0],Ndp); + std::cout << GridLogMessage<<"Double to single"; + for(int i=0;i Date: Thu, 13 Apr 2017 08:38:12 +0100 Subject: [PATCH 07/22] Exchange in generic Precision change in AVX, SSE, AVX512, Generic. QPX still to do. --- configure.ac | 2 +- lib/simd/Grid_avx.h | 6 +-- lib/simd/Grid_avx512.h | 4 +- lib/simd/Grid_generic.h | 88 +++++++++++++++++++++++++++++++++++ lib/simd/Grid_generic_types.h | 11 +++-- lib/simd/Grid_qpx.h | 1 - lib/simd/Grid_sse4.h | 2 +- tests/Test_simd.cc | 41 +++++++++------- 8 files changed, 126 insertions(+), 29 deletions(-) diff --git a/configure.ac b/configure.ac index 6924fae1..298c04de 100644 --- a/configure.ac +++ b/configure.ac @@ -10,7 +10,7 @@ AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ############### Checks for programs -CXXFLAGS="-O3 $CXXFLAGS" +CXXFLAGS="-g $CXXFLAGS" AC_PROG_CXX AC_PROG_RANLIB diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 86422f51..c2e02cb2 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -377,8 +377,8 @@ namespace Optimization { b0 = _mm256_extractf128_si256(b,0); a1 = _mm256_extractf128_si256(a,1); b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_mul_epi32(a0,b0); - a1 = _mm_mul_epi32(a1,b1); + a0 = _mm_mullo_epi32(a0,b0); + a1 = _mm_mullo_epi32(a1,b1); return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) @@ -494,7 +494,7 @@ namespace Optimization { a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); } - static inline __m256 DtoH (__m256i a,__m256 b,__m256 c,__m256 d) { + static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { __m256 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 647fba77..87798637 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -235,11 +235,9 @@ namespace Optimization { inline void mac(__m512 &a, __m512 b, __m512 c){ a= _mm512_fmadd_ps( b, c, a); } - inline void mac(__m512d &a, __m512d b, __m512d c){ a= _mm512_fmadd_pd( b, c, a); } - // Real float inline __m512 operator()(__m512 a, __m512 b){ return _mm512_mul_ps(a,b); @@ -366,7 +364,7 @@ namespace Optimization { a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0)); b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1)); } - static inline __m512 DtoH (__m512i a,__m512 b,__m512 c,__m512 d) { + static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { __m512 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 7972da55..466aa2f7 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -279,6 +279,93 @@ namespace Optimization { #undef timesi + struct PrecisionChange { + static inline vech StoH (const vecf &a,const vecf &b) { + vech ret; + vech *ha = (vech *)&a; + vech *hb = (vech *)&b; + const int nf = W::r; + // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } + // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } + VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } + VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } + return ret; + } + static inline void HtoS (vech h,vecf &sa,vecf &sb) { + const int nf = W::r; + const int nh = W::r; + vech *ha = (vech *)&sa; + vech *hb = (vech *)&sb; + VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} + VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } + VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } + } + static inline vecf DtoS (vecd a,vecd b) { + const int nd = W::r; + const int nf = W::r; + vecf ret; + VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } + VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + const int nd = W::r; + VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } + VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + struct Exchange{ + + template + static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + }; + + ////////////////////////////////////////////// // Some Template specialization #define perm(a, b, n, w)\ @@ -403,6 +490,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef Optimization::vech SIMD_Htype; // Reduced precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 2142bc8e..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -66,6 +66,10 @@ namespace Optimization { template <> struct W { constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; // SIMD vector types template @@ -73,8 +77,9 @@ namespace Optimization { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; - typedef vec vecf; - typedef vec vecd; - typedef vec veci; + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; }} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index d77a560a..757c84c9 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -125,7 +125,6 @@ namespace Optimization { f[2] = a.v2; f[3] = a.v3; } - //Double inline void operator()(double *d, vector4double a){ vec_st(a, 0, d); diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index cf78aa52..89eb7daa 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -357,7 +357,7 @@ namespace Optimization { s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); b = _mm_cvtps_pd(s); } - static inline __m128 DtoH (__m128i a,__m128 b,__m128 c,__m128 d) { + static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { __m128 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 423372f1..abec2410 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -308,18 +308,23 @@ public: int n; funcExchange(int _n) { n=_n;}; template void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} - template void apply(std::vector &r1,std::vector &r2,std::vector &in1,std::vector &in2) const { + template void apply(std::vector &r1, + std::vector &r2, + std::vector &in1, + std::vector &in2) const + { int sz=in1.size(); - - int msk = sz>>(n+1); - int j1=0; - int j2=0; - for(int i=0;i Date: Thu, 13 Apr 2017 08:40:44 +0100 Subject: [PATCH 08/22] Align cast fixed for __mm128i gcc complained --- lib/simd/Grid_sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 89eb7daa..b43c0628 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -343,7 +343,7 @@ namespace Optimization { } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { sa = _mm_cvtph_ps(h); - h = (__m128)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); sb = _mm_cvtph_ps(h); } static inline __m128 DtoS (__m128d a,__m128d b) { From 9c3065b860bb349483201d6fee365794aa0a21fc Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:01:32 +0100 Subject: [PATCH 09/22] Debug flags off again --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 298c04de..6924fae1 100644 --- a/configure.ac +++ b/configure.ac @@ -10,7 +10,7 @@ AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ############### Checks for programs -CXXFLAGS="-g $CXXFLAGS" +CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX AC_PROG_RANLIB From c38400b26fab1f7596bcac21fe23c69c0437fa59 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:35:20 +0100 Subject: [PATCH 10/22] Trap signals --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index bc6dd0ef..107343ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -92,11 +92,11 @@ script: - cd build - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto From 1c2577331990df1a4376fed3b0c971dda7d04ebb Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:51:40 +0100 Subject: [PATCH 11/22] Trap illegal instructions --- lib/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/util/Init.cc b/lib/util/Init.cc index dd3e6d13..990a4e0f 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -457,5 +457,6 @@ void Grid_debug_handler_init(void) sigaction(SIGFPE,&sa,NULL); sigaction(SIGKILL,&sa,NULL); + sigaction(SIGILL,&sa,NULL); } } From 73cdf0fffea01ec30c835092657dd1785652e0c2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 11:23:41 +0100 Subject: [PATCH 12/22] Drop f16c from SSE because of a macos compile error on travis --- configure.ac | 3 ++- lib/simd/Grid_sse4.h | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index 6924fae1..c45693c5 100644 --- a/configure.ac +++ b/configure.ac @@ -176,7 +176,8 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2 -mf16c';; +# SIMD_FLAGS='-msse4.2 -mf16c';; + SIMD_FLAGS='-msse4.2';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b43c0628..a6ab369d 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -336,15 +336,18 @@ namespace Optimization { #endif struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { - __m128i ha = _mm_cvtps_ph(a,0); - __m128i hb = _mm_cvtps_ph(b,0); - __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + // __m128i ha = _mm_cvtps_ph(a,0); + // __m128i hb = _mm_cvtps_ph(b,0); + // __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + __m128i h = (__m128i)a; + assert(0); return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { - sa = _mm_cvtph_ps(h); - h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); - sb = _mm_cvtph_ps(h); + // sa = _mm_cvtph_ps(h); + // h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + // sb = _mm_cvtph_ps(h); + assert(0); } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); From 1d502e4ed6e481647702f2f1fe39fa9bb8c6084f Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 11:55:24 +0100 Subject: [PATCH 13/22] FP16 optional compile time --- configure.ac | 13 +++++++++++++ lib/simd/Grid_avx.h | 8 ++++++++ lib/simd/Grid_avx512.h | 8 ++++++++ lib/simd/Grid_generic.h | 8 ++++++++ lib/simd/Grid_sse4.h | 19 +++++++++++++------ 5 files changed, 50 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index c45693c5..e0820d79 100644 --- a/configure.ac +++ b/configure.ac @@ -83,6 +83,19 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### FP16 conversions +AC_ARG_ENABLE([fp16], + [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], + [ac_FP16=${enable_fp16}], [ac_FP16=no]) +case ${ac_FP16} in + no) + ;; + yes) + AC_DEFINE([USE_FP16],[1],[conversion to fp16]);; + *) + ;; +esac + ############### MKL AC_ARG_ENABLE([mkl], [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index c2e02cb2..73792f84 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -473,15 +473,23 @@ namespace Optimization { struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { +#ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); __m256 h = _mm256_castps128_ps256(ha); h = _mm256_insertf128_ps(h,hb,1); +#else + assert(0); +#endif return h; } static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { +#ifdef USE_FP16 sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); +#else + assert(0); +#endif } static inline __m256 DtoS (__m256d a,__m256d b) { __m128 sa = _mm256_cvtpd_ps(a); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 87798637..dae3c1c7 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -343,15 +343,23 @@ namespace Optimization { struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { +#ifdef USE_FP16 __m256i ha = _mm512_cvtps_ph(a,0); __m256i hb = _mm512_cvtps_ph(b,0); __m512 h = _mm512_castps256_ps512(ha); h = _mm512_insertf256_ps(h,hb,1); +#else + assert(0); +#endif return h; } static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { +#ifdef USE_FP16 sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0)); sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1)); +#else + assert(0); +#endif } static inline __m512 DtoS (__m512d a,__m512d b) { __m256 sa = _mm512_cvtpd_ps(a); diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 466aa2f7..bdf5aa01 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -281,6 +281,7 @@ namespace Optimization { struct PrecisionChange { static inline vech StoH (const vecf &a,const vecf &b) { +#ifdef USE_FP16 vech ret; vech *ha = (vech *)&a; vech *hb = (vech *)&b; @@ -289,9 +290,13 @@ namespace Optimization { // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } +#else + assert(0); +#endif return ret; } static inline void HtoS (vech h,vecf &sa,vecf &sb) { +#ifdef USE_FP16 const int nf = W::r; const int nh = W::r; vech *ha = (vech *)&sa; @@ -301,6 +306,9 @@ namespace Optimization { // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } +#else + assert(0); +#endif } static inline vecf DtoS (vecd a,vecd b) { const int nd = W::r; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index a6ab369d..969ba3ed 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -334,20 +334,27 @@ namespace Optimization { #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) #endif + struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { - // __m128i ha = _mm_cvtps_ph(a,0); - // __m128i hb = _mm_cvtps_ph(b,0); - // __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); +#ifdef USE_FP16 + __m128i ha = _mm_cvtps_ph(a,0); + __m128i hb = _mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); +#else __m128i h = (__m128i)a; assert(0); +#endif return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { - // sa = _mm_cvtph_ps(h); - // h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); - // sb = _mm_cvtph_ps(h); +#ifdef USE_FP16 + sa = _mm_cvtph_ps(h); + h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = _mm_cvtph_ps(h); +#else assert(0); +#endif } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); From 2846f079e5002692e36edc866700895d31d6def2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:08:05 +0100 Subject: [PATCH 14/22] Predicate tests on fp16 being enabled --- tests/Test_simd.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index abec2410..c0bbef1d 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -725,6 +725,7 @@ int main (int argc, char ** argv) std::cout <<" OK ! "< Date: Thu, 13 Apr 2017 12:22:40 +0100 Subject: [PATCH 15/22] Update to use Xcode 8.3 since -mfp16 causes SIGILL --- .travis.yml | 2 +- configure.ac | 5 ++--- lib/simd/Grid_sse4.h | 14 ++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 107343ed..20716334 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ cache: matrix: include: - os: osx - osx_image: xcode7.2 + osx_image: xcode8.3 compiler: clang - compiler: gcc addons: diff --git a/configure.ac b/configure.ac index e0820d79..6b233c0d 100644 --- a/configure.ac +++ b/configure.ac @@ -86,7 +86,7 @@ esac ############### FP16 conversions AC_ARG_ENABLE([fp16], [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], - [ac_FP16=${enable_fp16}], [ac_FP16=no]) + [ac_FP16=${enable_fp16}], [ac_FP16=yes]) case ${ac_FP16} in no) ;; @@ -189,8 +189,7 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) -# SIMD_FLAGS='-msse4.2 -mf16c';; - SIMD_FLAGS='-msse4.2';; + SIMD_FLAGS='-msse4.2 -mf16c';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 969ba3ed..8a4537c2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -330,10 +330,8 @@ namespace Optimization { }; -#ifndef _mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif +#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { @@ -350,7 +348,7 @@ namespace Optimization { static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { #ifdef USE_FP16 sa = _mm_cvtph_ps(h); - h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); sb = _mm_cvtph_ps(h); #else assert(0); @@ -364,7 +362,7 @@ namespace Optimization { } static inline void StoD (__m128 s,__m128d &a,__m128d &b) { a = _mm_cvtps_pd(s); - s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); + s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); b = _mm_cvtps_pd(s); } static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { @@ -439,8 +437,8 @@ namespace Optimization { } } - template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; - template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; + template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; }; ////////////////////////////////////////////// From eb8e26018b3c579ffa29232df7ada67a024372b1 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:35:11 +0100 Subject: [PATCH 16/22] Travis update for macos --- .travis.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 20716334..df7eb5a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -73,8 +73,6 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: - export CC=$CC$VERSION @@ -98,9 +96,11 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - - make -j4 - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + - export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; + - make -j4; + - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; + - fi From 5a4eafbf7e6ba77515eca2f5b201109f57c5da69 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:50:43 +0100 Subject: [PATCH 17/22] .travis --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index df7eb5a7..6eab0a4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,8 +96,7 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - - export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; From 4226c633c4fb1284fe92c37ced904c1c81b742ba Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:51:39 +0100 Subject: [PATCH 18/22] Default to FP16 off again --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 6b233c0d..52acbfde 100644 --- a/configure.ac +++ b/configure.ac @@ -86,7 +86,7 @@ esac ############### FP16 conversions AC_ARG_ENABLE([fp16], [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], - [ac_FP16=${enable_fp16}], [ac_FP16=yes]) + [ac_FP16=${enable_fp16}], [ac_FP16=no]) case ${ac_FP16} in no) ;; From 75ea306ce9c96b76a5663d0488399465b14295ed Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:05:32 +0100 Subject: [PATCH 19/22] Another try at travis --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6eab0a4a..278d2c1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,8 +96,8 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - fi From d3b9a7fa145b46e2fbdcb5f0318345b79a0563de Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:19:11 +0100 Subject: [PATCH 20/22] F16c apparently requires AVX, even if the 128 bit are used. Seems odd. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 52acbfde..f18beb32 100644 --- a/configure.ac +++ b/configure.ac @@ -189,7 +189,7 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2 -mf16c';; + SIMD_FLAGS='-msse4.2';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; From 2ed6c76fc5a256103ca7949db726fec53b6b9ed6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:43:13 +0100 Subject: [PATCH 21/22] Getting multiline if then fi working --- .travis.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 278d2c1c..32c3c918 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,10 +96,10 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - - make -j4; - - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - - fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + make -j4; + mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; + fi From 0957378679c2f4a5cfe9d912858b1724b33d679c Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:47:56 +0100 Subject: [PATCH 22/22] Fixing conditional ugly way --- .travis.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 32c3c918..a33d8ec0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,10 +96,8 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi