From 86187d7cca3b29b7050c096938d4c89aed51194e Mon Sep 17 00:00:00 2001 From: Richard Rollins Date: Tue, 14 Jun 2016 15:34:20 +0100 Subject: [PATCH 01/21] Removed write to stdout in constructor for MPI CartesianCommunicator --- lib/communicator/Communicator_mpi.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index f804e8fe..dff9811a 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -53,7 +53,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) _Nprocessors=1; _processors = processors; _processor_coor.resize(_ndimension); - std::cout << processors << std::endl; MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Comm_rank(communicator,&_processor); From d6737e4bd8eefed1923dcbeef868a0c48134f6bb Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 14 Jun 2016 19:07:01 +0100 Subject: [PATCH 02/21] Travis fix for Linux clang builds --- .travis.yml | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index cd73fbac..82066d87 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,9 @@ language: cpp +cache: + directories: + - clang + matrix: include: - os: osx @@ -38,29 +42,31 @@ matrix: apt: sources: - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.7 packages: - - clang-3.7 + - g++-4.8 - libmpfr-dev - libgmp-dev - libmpc-dev - binutils-dev - env: VERSION=-3.7 + env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz - compiler: clang addons: apt: sources: - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.8 packages: - - clang-3.8 + - g++-4.8 - libmpfr-dev - libgmp-dev - libmpc-dev - binutils-dev - env: VERSION=-3.8 + env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz before_install: + - export GRIDDIR=`pwd` + - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi @@ -68,6 +74,11 @@ before_install: install: - export CC=$CC$VERSION - export CXX=$CXX$VERSION + - echo $PATH + - which $CC + - $CC --version + - which $CXX + - $CXX --version - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi script: From 1b7f88dd003eb5d863e39ea95ceb6564f19de791 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 19 Jun 2016 11:45:58 -0700 Subject: [PATCH 03/21] Enable reordering of the loops in the assembler for cache friendly. This gets in the way of L2 prefetching however. Do next next link in stencil prefetching. --- benchmarks/Benchmark_dwf_sweep.cc | 358 ++++++++++++++++++ lib/qcd/action/fermion/DomainWallFermion.h | 2 +- lib/qcd/action/fermion/WilsonFermion.cc | 14 +- lib/qcd/action/fermion/WilsonFermion.h | 6 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 4 +- lib/qcd/action/fermion/WilsonKernels.cc | 20 +- lib/qcd/action/fermion/WilsonKernels.h | 14 +- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 36 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 76 ++-- .../action/fermion/WilsonKernelsAsmBody.h.ab | 163 ++++++++ lib/qcd/action/fermion/WilsonKernelsHand.cc | 36 +- lib/simd/Intel512common.h | 6 +- lib/simd/Intel512wilson.h | 32 +- lib/stencil/Lebesgue.cc | 18 +- lib/stencil/Lebesgue.h | 1 + 15 files changed, 670 insertions(+), 116 deletions(-) create mode 100644 benchmarks/Benchmark_dwf_sweep.cc create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc new file mode 100644 index 00000000..302059a4 --- /dev/null +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -0,0 +1,358 @@ + + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +void benchDw(std::vector & L, int Ls, int threads, int report =0 ); +void benchsDw(std::vector & L, int Ls, int threads, int report=0 ); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=16; + int threads = GridThread::GetThreads(); + std::cout< latt4(4,L); + for(int d=4;d>0;d--){ + if ( d<=3 ) latt4[d]*=2; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d latt4(4,16); + std::cout< & latt4, int Ls, int threads,int report ) +{ + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s U(4,FGrid); + for(int mu=0;mu(Umu5d,mu); + } + +#ifdef CHECK + if (1) + { + ref = zero; + for(int mu=0;mu_Nprocessors; + + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + double t0=usecond(); + Dw.Dhop(src,result,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + if (ncall < 5 ) exit(0); + + Dw.Dhop(src,result,0); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;i 1.0e-4 ) { + std::cout< & latt4, int Ls, int threads, int report ) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); + GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); + GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK_SDW + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s WilsonFermion5DF; + LatticeFermionF ssrc(sFGrid); + LatticeFermionF sref(sFGrid); + LatticeFermionF sresult(sFGrid); + WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5); + + for(int x=0;x site({s,x,y,z,t}); + SpinColourVectorF tmp; + peekSite(tmp,src,site); + pokeSite(tmp,ssrc,site); + }}}}} + + double t0=usecond(); + sDw.Dhop(ssrc,sresult,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;iLs);// eps is ignored for higham assert(zdata->n==this->Ls); - std::cout< #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) -#define VPREFETCHNTA(O,A) -#define VPREFETCH(O,A) #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 1955cc6d..2bc0545d 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -559,22 +559,23 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) \ +#define PREFETCH_CHIMU(A) +/* LOAD64(%r9,A) \ __asm__ ( \ - VPREFETCHG(12,%r9)\ - VPREFETCHG(13,%r9)\ - VPREFETCHG(14,%r9)\ - VPREFETCHG(15,%r9)\ - VPREFETCHG(16,%r9)\ - VPREFETCHG(17,%r9)\ - VPREFETCHG(18,%r9)\ - VPREFETCHG(19,%r9)\ - VPREFETCHG(20,%r9)\ - VPREFETCHG(21,%r9)\ - VPREFETCHG(22,%r9)\ - VPREFETCHG(23,%r9)); - + VPREFETCHG(0,%r9)\ + VPREFETCHG(1,%r9)\ + VPREFETCHG(2,%r9)\ + VPREFETCHG(3,%r9)\ + VPREFETCHG(4,%r9)\ + VPREFETCHG(5,%r9)\ + VPREFETCHG(6,%r9)\ + VPREFETCHG(7,%r9)\ + VPREFETCHG(8,%r9)\ + VPREFETCHG(9,%r9)\ + VPREFETCHG(10,%r9)\ + VPREFETCHG(11,%r9)); +*/ #define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ @@ -612,8 +613,7 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) \ - VPREFETCH2(10,%r8) \ + VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ VPREFETCH2(11,%r8) \ VPREFETCH2(12,%r8) \ VPREFETCH2(13,%r8) \ diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 7704e08f..c34b5c96 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) { grid = _grid; if ( Block[0]==0) ZGraph(); + else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); } +void LebesgueOrder::NoBlocking(void) +{ + std::cout<oSites();s++){ + _LebesgueReorder.push_back(s); + } +} void LebesgueOrder::CartesianBlocking(void) { _LebesgueReorder.resize(0); - std::cout << GridLogMessage << " CartesianBlocking "; - for(int d=0;d_ndimension; @@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND, void LebesgueOrder::ZGraph(void) { _LebesgueReorder.resize(0); - + + std::cout << GridLogDebug << " Lebesgue order "< Block; + void NoBlocking(void); void CartesianBlocking(void); void IterateO(int ND,int dim, std::vector & xo, From 17a8f51a9b5ada82ec5ecf5739c04847a5f7ed3f Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 19 Jun 2016 11:59:10 -0700 Subject: [PATCH 04/21] update file lists --- benchmarks/Make.inc | 6 +++++- lib/Make.inc | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/Make.inc b/benchmarks/Make.inc index b60ec835..8d0721a4 100644 --- a/benchmarks/Make.inc +++ b/benchmarks/Make.inc @@ -1,5 +1,5 @@ -bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm +bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm Benchmark_comms_SOURCES=Benchmark_comms.cc @@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc Benchmark_dwf_ntpf_LDADD=-lGrid +Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc +Benchmark_dwf_sweep_LDADD=-lGrid + + Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc Benchmark_memory_asynch_LDADD=-lGrid diff --git a/lib/Make.inc b/lib/Make.inc index 900da916..8763692a 100644 --- a/lib/Make.inc +++ b/lib/Make.inc @@ -1,4 +1,4 @@ -HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h +HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512wilson.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc From 09fe3caebd985fe8e2db1ff85164e3779cdaf17d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 11:08:05 -0700 Subject: [PATCH 05/21] Tweaks --- benchmarks/Benchmark_dwf_sweep.cc | 16 +- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 7 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 79 +++----- .../action/fermion/WilsonKernelsAsmBody.h.abc | 187 ++++++++++++++++++ lib/simd/Intel512common.h | 4 +- 5 files changed, 242 insertions(+), 51 deletions(-) create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc index 302059a4..94a00903 100644 --- a/benchmarks/Benchmark_dwf_sweep.cc +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -68,10 +68,12 @@ int main (int argc, char ** argv) std::cout< latt4(4,L); - for(int d=4;d>0;d--){ + for(int d=4;d>dmin;d--){ if ( d<=3 ) latt4[d]*=2; std::cout << GridLogMessage <<"\t"; for(int d=0;d & latt4, int Ls, int threads,int report ) Dw.Dhop(src,result,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif if (ncall < 5 ) exit(0); @@ -297,7 +303,11 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) sDw.Dhop(ssrc,sresult,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif PerformanceCounter Counter(8); Counter.Start(); @@ -340,7 +350,9 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) CounterSdw.Start(); t0=usecond(); for(int i=0;i void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, @@ -80,6 +83,8 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd #undef VMOVRDUP #undef MAYBEPERM #undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index bd96b7d5..d3e86276 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,8 +1,7 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - int localc,permc, ptypec; - uint64_t basea, baseb, basec; + uint64_t basea, baseb; uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; @@ -12,22 +11,15 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +47,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basec); + MULT_2SPIN_DIR_PFYP(Yp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -65,16 +56,15 @@ // Zp //////////////////////////////// baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR1,permc); + ZM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); } else { - LOAD_CHI(basec); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFZP(Zp,basea); + MULT_2SPIN_DIR_PFZP(Zp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -82,17 +72,16 @@ //////////////////////////////// // Tp //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR0,perma); + TM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFTP(Tp,baseb); + MULT_2SPIN_DIR_PFTP(Tp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -100,17 +89,16 @@ //////////////////////////////// // Xm //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR3,permb); + XP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR3,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFXM(Xm,basec); + MULT_2SPIN_DIR_PFXM(Xm,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -118,14 +106,13 @@ //////////////////////////////// // Ym //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR2,permc); + YP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); } else { - LOAD_CHI(basec); + LOAD_CHI(baseb); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -136,8 +123,7 @@ //////////////////////////////// // Zm //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -155,7 +141,6 @@ // Tm //////////////////////////////// basea = (uint64_t)&out._odata[ss]; - PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -163,16 +148,16 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basec); + MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - // PREFETCH_CHIMU(basex); - SAVE_RESULT(&out._odata[ss]); - - } + SAVE_RESULT(&out._odata[ss],baseb); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc new file mode 100644 index 00000000..5a3e01f7 --- /dev/null +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc @@ -0,0 +1,187 @@ +{ + int locala,perma, ptypea; + int localb,permb, ptypeb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + uint64_t basex; + const uint64_t plocal =(uint64_t) & in._odata[0]; + + // vComplexF isigns[2] = { signs[0], signs[1] }; + vComplexF *isigns = &signs[0]; + + MASK_REGS; + + for(int site=0;site shuffle and xor the real part sign bit + YM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYP(Yp,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YM_RECON_ACCUM; + + //////////////////////////////// + // Zp + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(ZP) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFZP(Zp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_RECON_ACCUM; + + //////////////////////////////// + // Tp + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(TP) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFTP(Tp,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_RECON_ACCUM; + + //////////////////////////////// + // Xm + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + label(FX(XM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFXM(Xm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_RECON_ACCUM; + + //////////////////////////////// + // Ym + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(YM) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFYM(Ym,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_RECON_ACCUM; + + //////////////////////////////// + // Zm + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(ZM) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZM(Zm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_RECON_ACCUM; + + //////////////////////////////// + // Tm + //////////////////////////////// + basea = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basea); + label(FX(TM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFTM(Tm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_RECON_ACCUM; + + // PREFETCH_CHIMU(basex); + label(FX(SAV) ); + SAVE_RESULT(&out._odata[ss]); + + } + ssU++; + } +} diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a3cd980d..6878bcfb 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -37,6 +37,8 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); +//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); + #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" From 22e88eaf541f5d279df7c20bc97ab6c20d9409de Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:54:14 -0700 Subject: [PATCH 06/21] Prefetch during save --- lib/simd/Intel512wilson.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 2bc0545d..207d9db8 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -104,7 +104,7 @@ Author: paboyle #define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) #define SAVE_CHI(PTR) SAVE_CHIi(PTR) -#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR) +#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) #define LOAD_CHIMUi \ LOAD_CHIMU01i \ @@ -169,21 +169,22 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR)\ +#define SAVE_RESULTi(PTR,pf) \ LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ __asm__ ( \ - VSTORE(0,%r8,result_00) \ - VSTORE(1,%r8,result_01) \ - VSTORE(2,%r8,result_02) \ - VSTORE(3,%r8,result_10) \ - VSTORE(4,%r8,result_11) \ - VSTORE(5,%r8,result_12) \ - VSTORE(6,%r8,result_20) \ - VSTORE(7,%r8,result_21) \ - VSTORE(8,%r8,result_22) \ - VSTORE(9,%r8,result_30) \ - VSTORE(10,%r8,result_31) \ - VSTORE(11,%r8,result_32) \ + VSTORE(0,%r8,result_00) VPREFETCHG(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCHG(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCHG(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCHG(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCHG(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCHG(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCHG(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCHG(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCHG(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCHG(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCHG(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCHG(11,%r9) \ ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) From db057cc2762a3b503f6e7c306621dbfc8aab5a1d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:54:50 -0700 Subject: [PATCH 07/21] Prefetch change --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d3e86276..d50999f6 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -2,7 +2,6 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -20,7 +19,6 @@ int ent=ss*8;// 2*Ndim basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - basex = basea; if ( locala ) { LOAD64(%r10,isigns); @@ -38,7 +36,7 @@ //////////////////////////////// // Yp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -55,7 +53,7 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_PROJMEM(basea); @@ -72,7 +70,7 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_PROJMEM(baseb); @@ -89,7 +87,7 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_PROJMEM(basea); @@ -106,7 +104,7 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_PROJMEM(baseb); @@ -123,7 +121,7 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -148,7 +146,7 @@ } else { LOAD_CHI(baseb); } - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { MULT_2SPIN_DIR_PFTM(Tm,basea); } From b2933a0557d211fa3b3d86d9d9ee4d38ebb854b1 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:55:25 -0700 Subject: [PATCH 08/21] COntrol the prefetch strategy --- lib/simd/Intel512common.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index 6878bcfb..a05f978c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -28,6 +28,14 @@ Author: paboyle #ifndef GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Peformance options +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define AVX512_PF_L1 +#undef AVX512_PF_L2_LINEAR +#undef AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_WRITE + //////////////////////////////////////////////////////////////////////////////////////////////////// // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,10 +96,30 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" +#ifdef AVX512_PF_L1 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" +#else +#define VPREFETCHG(O,A) +#endif + +#ifdef AVX512_PF_L2_LINEAR #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" +#else +#define VPREFETCH2(O,A) +#endif + +#ifdef AVX512_PF_L2_TABLE #define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" +#else +#define VPREFETCHP(O,A) +#endif + +#ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" +#else +#define VPREFETCHW(O,A) +#endif + #define VPREFETCHNTA(O,A) #define VPREFETCH(O,A) From 4bc08ed9956b2a461ec227b4144dc93c2b134968 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 26 Jun 2016 12:54:14 -0700 Subject: [PATCH 09/21] Improved the prefetching when using cache blocking codes --- lib/Stencil.h | 5 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 86 ++++--- .../action/fermion/WilsonKernelsAsmBody.h.ab | 18 +- lib/simd/Intel512common.h | 24 +- lib/simd/Intel512wilson.h | 237 +++++++++++------- 5 files changed, 208 insertions(+), 162 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 8019e3f9..bc015370 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -261,6 +261,9 @@ } }; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); local = _entries[ent]._is_local; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d50999f6..7373d2eb 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,7 +1,9 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - uint64_t basea, baseb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -10,15 +12,22 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -45,7 +55,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basea); + MULT_2SPIN_DIR_PFYP(Yp,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -53,16 +63,17 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; - if ( locala ) { + baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); } else { - LOAD_CHI(basea); + LOAD_CHI(basec); } { - MULT_2SPIN_DIR_PFZP(Zp,baseb); + MULT_2SPIN_DIR_PFZP(Zp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -70,16 +81,17 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; - if ( localb ) { + basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFTP(Tp,basea); + MULT_2SPIN_DIR_PFTP(Tp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -87,16 +99,17 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR3,perma); + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFXM(Xm,baseb); + MULT_2SPIN_DIR_PFXM(Xm,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -104,13 +117,14 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); } else { - LOAD_CHI(baseb); + LOAD_CHI(basec); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -121,7 +135,8 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + basec = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basec); if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -138,7 +153,8 @@ //////////////////////////////// // Tm //////////////////////////////// - basea = (uint64_t)&out._odata[ss]; + // basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + // PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -146,16 +162,16 @@ } else { LOAD_CHI(baseb); } - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basea); + MULT_2SPIN_DIR_PFTM(Tm,basec); } + // baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss],baseb); - - } + SAVE_RESULT(&out._odata[ss],basec); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab index 3ba9eec6..d50999f6 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab @@ -2,7 +2,6 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -19,9 +18,7 @@ //////////////////////////////// int ent=ss*8;// 2*Ndim basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - basex = basea; if ( locala ) { LOAD64(%r10,isigns); @@ -39,7 +36,7 @@ //////////////////////////////// // Yp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +53,7 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_PROJMEM(basea); @@ -73,7 +70,7 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_PROJMEM(baseb); @@ -90,7 +87,7 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_PROJMEM(basea); @@ -107,7 +104,7 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_PROJMEM(baseb); @@ -124,7 +121,7 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -149,13 +146,14 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss]); + SAVE_RESULT(&out._odata[ss],baseb); } ssU++; diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a05f978c..dabbf6d8 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -31,9 +31,6 @@ Author: paboyle //////////////////////////////////////////////////////////////////////////////////////////////////// // Peformance options //////////////////////////////////////////////////////////////////////////////////////////////////// -#define AVX512_PF_L1 -#undef AVX512_PF_L2_LINEAR -#undef AVX512_PF_L2_TABLE #undef AVX512_PF_L2_WRITE //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -45,7 +42,7 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); -//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); +//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" @@ -96,30 +93,13 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" -#ifdef AVX512_PF_L1 -#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" -#else -#define VPREFETCHG(O,A) -#endif - -#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCH2(O,A) -#endif - -#ifdef AVX512_PF_L2_TABLE -#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCHP(O,A) -#endif - #ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" #else #define VPREFETCHW(O,A) #endif - #define VPREFETCHNTA(O,A) #define VPREFETCH(O,A) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 207d9db8..9deffd80 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -169,23 +169,6 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR,pf) \ - LOAD64(%r8,PTR) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSTORE(0,%r8,result_00) VPREFETCHG(0,%r9) \ - VSTORE(1,%r8,result_01) VPREFETCHG(1,%r9) \ - VSTORE(2,%r8,result_02) VPREFETCHG(2,%r9) \ - VSTORE(3,%r8,result_10) VPREFETCHG(3,%r9) \ - VSTORE(4,%r8,result_11) VPREFETCHG(4,%r9) \ - VSTORE(5,%r8,result_12) VPREFETCHG(5,%r9) \ - VSTORE(6,%r8,result_20) VPREFETCHG(6,%r9) \ - VSTORE(7,%r8,result_21) VPREFETCHG(7,%r9) \ - VSTORE(8,%r8,result_22) VPREFETCHG(8,%r9) \ - VSTORE(9,%r8,result_30) VPREFETCHG(9,%r9) \ - VSTORE(10,%r8,result_31) VPREFETCHG(10,%r9) \ - VSTORE(11,%r8,result_32) VPREFETCHG(11,%r9) \ - ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) @@ -560,24 +543,89 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) -/* - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCHG(0,%r9)\ - VPREFETCHG(1,%r9)\ - VPREFETCHG(2,%r9)\ - VPREFETCHG(3,%r9)\ - VPREFETCHG(4,%r9)\ - VPREFETCHG(5,%r9)\ - VPREFETCHG(6,%r9)\ - VPREFETCHG(7,%r9)\ - VPREFETCHG(8,%r9)\ - VPREFETCHG(9,%r9)\ - VPREFETCHG(10,%r9)\ - VPREFETCHG(11,%r9)); -*/ -#define PERMUTE_DIR0 __asm__ ( \ +#define AVX512_PF_L1 +#define AVX512_PF_L2_GAUGE +#define AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_LINEAR + +#ifdef AVX512_PF_L2_TABLE +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) VPREFETCH1(A,B) +#else +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) +#endif +#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH_M1(A,B) +#define VPREFETCH_M2(A,B) +#else +#define VPREFETCH_M1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#endif +#ifdef AVX512_PF_L2_GAUGE +#define VPREFETCH_G1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_G2(A,B) VPREFETCH2(A,B) +#else +#endif + +#define PF_GAUGE(A) \ + LOAD64(%r8,&U._odata[sU](A)) \ + __asm__ ( \ + VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \ + VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ + ); + +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ + ); + +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P2(0,%r9) \ + VPREFETCH_P2(1,%r9) \ + VPREFETCH_P2(2,%r9) \ + VPREFETCH_P2(3,%r9) \ + VPREFETCH_P2(4,%r9) \ + VPREFETCH_P2(5,%r9) \ + VPREFETCH_P2(6,%r9) \ + VPREFETCH_P2(7,%r9) \ + VPREFETCH_P2(8,%r9) \ + VPREFETCH_P2(9,%r9) \ + VPREFETCH_P2(10,%r9) \ + VPREFETCH_P2(11,%r9)); + +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ VPERM0(Chi_02,Chi_02) \ @@ -614,14 +662,15 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ - VPREFETCH2(11,%r8) \ - VPREFETCH2(12,%r8) \ - VPREFETCH2(13,%r8) \ - VPREFETCH2(14,%r8) \ - VPREFETCH2(15,%r8) \ - VPREFETCH2(16,%r8) \ - VPREFETCH2(17,%r8) \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ VSHUF(Chi_00,T1) \ VMOVIDUP(0,%r8,Z0 ) \ VMOVIDUP(3,%r8,Z1 ) \ @@ -633,10 +682,10 @@ Author: paboyle VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*18*/ \ VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ VMADDSUB(Z3,Chi_10,UChi_10) \ @@ -644,10 +693,10 @@ Author: paboyle VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*28*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -674,15 +723,15 @@ Author: paboyle VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCHG(9,%r8) \ - VPREFETCHG(10,%r8) \ - VPREFETCHG(11,%r8) \ - VPREFETCHG(12,%r8) \ - VPREFETCHG(13,%r8) \ - VPREFETCHG(14,%r8) \ - VPREFETCHG(15,%r8) \ - VPREFETCHG(16,%r8) \ - VPREFETCHG(17,%r8) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ /*48*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -690,10 +739,10 @@ Author: paboyle VMADDSUB(Z1,T2,UChi_11) \ VMADDSUB(Z2,T1,UChi_02) \ VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*55*/ \ VMADDSUB(Z3,Chi_02,UChi_00) \ VMADDSUB(Z3,Chi_12,UChi_10) \ @@ -712,56 +761,56 @@ Author: paboyle VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*8*/ \ VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*16*/ \ VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*22*/ \ VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ /*30*/ \ VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - VPREFETCHG(2,%r8) \ - VPREFETCHG(3,%r8) \ - VPREFETCH2(4,%r8) \ - VPREFETCH2(5,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ @@ -794,8 +843,8 @@ Author: paboyle VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCHG(2,%r8)*/ \ - /* VPREFETCHG(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ ); From 661b0ab45de2e6765180fa60b21fff8fc9bbe2fc Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 30 Jun 2016 13:07:42 -0700 Subject: [PATCH 10/21] Updated to have perfect prefetching for the s-vectorised kernel with any cache blocking. --- lib/Stencil.h | 5 + lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 153 +++++++++--------- lib/simd/Intel512wilson.h | 58 ++++--- 3 files changed, 119 insertions(+), 97 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index bc015370..f5b6c288 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -272,6 +272,11 @@ if (local) return base + _entries[ent]._byte_offset; else return _entries[ent]._byte_offset; } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return _entries[ent]._byte_offset; + } // Comms buffers std::vector > u_simd_send_buf; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 7373d2eb..4f3ef861 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,43 +1,44 @@ { - int locala,perma, ptypea; - int localb,permb, ptypeb; - int localc,permc, ptypec; - uint64_t basea, baseb, basec; - + int local,perm, ptype; + uint64_t base; + uint64_t basep; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; vComplexF *isigns = &signs[0]; MASK_REGS; - + int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; + int sUn=lo.Reorder(ssn); for(int s=0;s shuffle and xor the real part sign bit - YM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYP(Yp,basec); + MULT_2SPIN_DIR_PFYP(Yp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -63,17 +65,18 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR1,permc); + ZM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - LOAD_CHI(basec); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZP(Zp,basea); + MULT_2SPIN_DIR_PFZP(Zp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -81,17 +84,18 @@ //////////////////////////////// // Tp //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++; - PREFETCH_CHIMU(basec); - if ( locala ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR0,perma); + TM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTP(Tp,baseb); + MULT_2SPIN_DIR_PFTP(Tp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -99,17 +103,19 @@ //////////////////////////////// // Xm //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; - PREFETCH_CHIMU(basea); - if ( localb ) { + basep= (uint64_t) &out._odata[ss]; + // basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR3,permb); + XP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR3,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFXM(Xm,basec); + MULT_2SPIN_DIR_PFXM(Xm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -117,17 +123,18 @@ //////////////////////////////// // Ym //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR2,permc); + YP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(basec); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYM(Ym,basea); + MULT_2SPIN_DIR_PFYM(Ym,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_RECON_ACCUM; @@ -135,17 +142,18 @@ //////////////////////////////// // Zm //////////////////////////////// - basec = (uint64_t)&out._odata[ss]; - PREFETCH_CHIMU(basec); - if ( locala ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZM(Zm,baseb); + MULT_2SPIN_DIR_PFZM(Zm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_RECON_ACCUM; @@ -153,23 +161,24 @@ //////////////////////////////// // Tm //////////////////////////////// - // basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - // PREFETCH_CHIMU(basea); - if ( localb ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base= (uint64_t) &out._odata[ss]; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTM(Tm,basec); + MULT_2SPIN_DIR_PFTM(Tm,basep); } - // baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss],basec); + basep= st.GetPFInfo(nent,plocal); nent++; + SAVE_RESULT(base,basep); } ssU++; diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 9deffd80..660d07d6 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -261,8 +261,8 @@ Author: paboyle #define XM_PROJMEM(PTR) \ LOAD64(%r8,PTR)\ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ @@ -290,8 +290,8 @@ Author: paboyle #define ZM_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ @@ -548,24 +548,25 @@ Author: paboyle #define AVX512_PF_L2_TABLE #undef AVX512_PF_L2_LINEAR -#ifdef AVX512_PF_L2_TABLE -#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) -#define VPREFETCH_P2(A,B) VPREFETCH1(A,B) -#else -#define VPREFETCH_P1(A,B) -#define VPREFETCH_P2(A,B) -#endif -#ifdef AVX512_PF_L2_LINEAR -#define VPREFETCH_M1(A,B) +#ifdef AVX512_PF_L2_TABLE +// P1 Fetches the base pointer for next link into L1 with P1 +// M1 Fetches the next site pointer into L2 +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) +#define VPREFETCH_M1(A,B) VPREFETCH2(A,B) #define VPREFETCH_M2(A,B) -#else +#endif + +#ifdef AVX512_PF_L2_LINEAR #define VPREFETCH_M1(A,B) VPREFETCH1(A,B) #define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) #endif + #ifdef AVX512_PF_L2_GAUGE #define VPREFETCH_G1(A,B) VPREFETCH1(A,B) #define VPREFETCH_G2(A,B) VPREFETCH2(A,B) -#else #endif #define PF_GAUGE(A) \ @@ -593,21 +594,26 @@ Author: paboyle VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ ); +#ifdef AVX512_PF_L2_TABLE #define PREFETCH_CHIMU(A) \ LOAD64(%r9,A) \ __asm__ ( \ - VPREFETCH_P2(0,%r9) \ - VPREFETCH_P2(1,%r9) \ - VPREFETCH_P2(2,%r9) \ - VPREFETCH_P2(3,%r9) \ - VPREFETCH_P2(4,%r9) \ - VPREFETCH_P2(5,%r9) \ - VPREFETCH_P2(6,%r9) \ - VPREFETCH_P2(7,%r9) \ - VPREFETCH_P2(8,%r9) \ - VPREFETCH_P2(9,%r9) \ - VPREFETCH_P2(10,%r9) \ - VPREFETCH_P2(11,%r9)); + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#else +#define PREFETCH_CHIMU(A) +#endif #define PREFETCH1_CHIMU(A) \ LOAD64(%r9,A) \ @@ -811,6 +817,8 @@ Author: paboyle VPREFETCH_G1(3,%r8) \ VPREFETCH_G2(4,%r8) \ VPREFETCH_G2(5,%r8) \ + VPREFETCH_G2(6,%r8) \ + VPREFETCH_G2(7,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ From 532f41dd61c3105369f58fb6dbd9329d29eca90b Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 30 Jun 2016 14:00:34 -0700 Subject: [PATCH 11/21] Asm only for avx512 --- lib/qcd/action/fermion/WilsonKernels.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 672c23d6..4edd25f9 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -42,12 +42,15 @@ void WilsonKernels::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,Dou std::vector > &buf, int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 if ( AsmOpt ) { WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); } else { - +#else + { +#endif for(int site=0;site::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); From 6d58cb2a68bf939e253acff4b3e55d40aa5f9f7b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 19 Jun 2016 11:45:58 -0700 Subject: [PATCH 12/21] Enable reordering of the loops in the assembler for cache friendly. This gets in the way of L2 prefetching however. Do next next link in stencil prefetching. --- benchmarks/Benchmark_dwf_sweep.cc | 358 ++++++++++++++++++ lib/qcd/action/fermion/DomainWallFermion.h | 2 +- lib/qcd/action/fermion/WilsonFermion.cc | 14 +- lib/qcd/action/fermion/WilsonFermion.h | 6 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 4 +- lib/qcd/action/fermion/WilsonKernels.cc | 20 +- lib/qcd/action/fermion/WilsonKernels.h | 14 +- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 36 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 76 ++-- .../action/fermion/WilsonKernelsAsmBody.h.ab | 163 ++++++++ lib/qcd/action/fermion/WilsonKernelsHand.cc | 36 +- lib/simd/Intel512common.h | 6 +- lib/simd/Intel512wilson.h | 32 +- lib/stencil/Lebesgue.cc | 18 +- lib/stencil/Lebesgue.h | 1 + 15 files changed, 670 insertions(+), 116 deletions(-) create mode 100644 benchmarks/Benchmark_dwf_sweep.cc create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc new file mode 100644 index 00000000..302059a4 --- /dev/null +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -0,0 +1,358 @@ + + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +void benchDw(std::vector & L, int Ls, int threads, int report =0 ); +void benchsDw(std::vector & L, int Ls, int threads, int report=0 ); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=16; + int threads = GridThread::GetThreads(); + std::cout< latt4(4,L); + for(int d=4;d>0;d--){ + if ( d<=3 ) latt4[d]*=2; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d latt4(4,16); + std::cout< & latt4, int Ls, int threads,int report ) +{ + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s U(4,FGrid); + for(int mu=0;mu(Umu5d,mu); + } + +#ifdef CHECK + if (1) + { + ref = zero; + for(int mu=0;mu_Nprocessors; + + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + double t0=usecond(); + Dw.Dhop(src,result,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + if (ncall < 5 ) exit(0); + + Dw.Dhop(src,result,0); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;i 1.0e-4 ) { + std::cout< & latt4, int Ls, int threads, int report ) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); + GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); + GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK_SDW + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s WilsonFermion5DF; + LatticeFermionF ssrc(sFGrid); + LatticeFermionF sref(sFGrid); + LatticeFermionF sresult(sFGrid); + WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5); + + for(int x=0;x site({s,x,y,z,t}); + SpinColourVectorF tmp; + peekSite(tmp,src,site); + pokeSite(tmp,ssrc,site); + }}}}} + + double t0=usecond(); + sDw.Dhop(ssrc,sresult,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;iLs);// eps is ignored for higham assert(zdata->n==this->Ls); - std::cout< #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) -#define VPREFETCHNTA(O,A) -#define VPREFETCH(O,A) #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 1955cc6d..2bc0545d 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -559,22 +559,23 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) \ +#define PREFETCH_CHIMU(A) +/* LOAD64(%r9,A) \ __asm__ ( \ - VPREFETCHG(12,%r9)\ - VPREFETCHG(13,%r9)\ - VPREFETCHG(14,%r9)\ - VPREFETCHG(15,%r9)\ - VPREFETCHG(16,%r9)\ - VPREFETCHG(17,%r9)\ - VPREFETCHG(18,%r9)\ - VPREFETCHG(19,%r9)\ - VPREFETCHG(20,%r9)\ - VPREFETCHG(21,%r9)\ - VPREFETCHG(22,%r9)\ - VPREFETCHG(23,%r9)); - + VPREFETCHG(0,%r9)\ + VPREFETCHG(1,%r9)\ + VPREFETCHG(2,%r9)\ + VPREFETCHG(3,%r9)\ + VPREFETCHG(4,%r9)\ + VPREFETCHG(5,%r9)\ + VPREFETCHG(6,%r9)\ + VPREFETCHG(7,%r9)\ + VPREFETCHG(8,%r9)\ + VPREFETCHG(9,%r9)\ + VPREFETCHG(10,%r9)\ + VPREFETCHG(11,%r9)); +*/ #define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ @@ -612,8 +613,7 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) \ - VPREFETCH2(10,%r8) \ + VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ VPREFETCH2(11,%r8) \ VPREFETCH2(12,%r8) \ VPREFETCH2(13,%r8) \ diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 7704e08f..c34b5c96 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) { grid = _grid; if ( Block[0]==0) ZGraph(); + else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); } +void LebesgueOrder::NoBlocking(void) +{ + std::cout<oSites();s++){ + _LebesgueReorder.push_back(s); + } +} void LebesgueOrder::CartesianBlocking(void) { _LebesgueReorder.resize(0); - std::cout << GridLogMessage << " CartesianBlocking "; - for(int d=0;d_ndimension; @@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND, void LebesgueOrder::ZGraph(void) { _LebesgueReorder.resize(0); - + + std::cout << GridLogDebug << " Lebesgue order "< Block; + void NoBlocking(void); void CartesianBlocking(void); void IterateO(int ND,int dim, std::vector & xo, From 51cb2d43289cec27a511c64825883c4bc7662279 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 19 Jun 2016 11:59:10 -0700 Subject: [PATCH 13/21] update file lists --- benchmarks/Make.inc | 6 +++++- lib/Make.inc | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/Make.inc b/benchmarks/Make.inc index b60ec835..8d0721a4 100644 --- a/benchmarks/Make.inc +++ b/benchmarks/Make.inc @@ -1,5 +1,5 @@ -bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm +bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm Benchmark_comms_SOURCES=Benchmark_comms.cc @@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc Benchmark_dwf_ntpf_LDADD=-lGrid +Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc +Benchmark_dwf_sweep_LDADD=-lGrid + + Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc Benchmark_memory_asynch_LDADD=-lGrid diff --git a/lib/Make.inc b/lib/Make.inc index 900da916..8763692a 100644 --- a/lib/Make.inc +++ b/lib/Make.inc @@ -1,4 +1,4 @@ -HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h +HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512wilson.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc From 2d8bb4c594b52233b204cf1ed6d318e231acfe14 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 11:08:05 -0700 Subject: [PATCH 14/21] Tweaks --- benchmarks/Benchmark_dwf_sweep.cc | 16 +- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 7 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 79 +++----- .../action/fermion/WilsonKernelsAsmBody.h.abc | 187 ++++++++++++++++++ lib/simd/Intel512common.h | 4 +- 5 files changed, 242 insertions(+), 51 deletions(-) create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc index 302059a4..94a00903 100644 --- a/benchmarks/Benchmark_dwf_sweep.cc +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -68,10 +68,12 @@ int main (int argc, char ** argv) std::cout< latt4(4,L); - for(int d=4;d>0;d--){ + for(int d=4;d>dmin;d--){ if ( d<=3 ) latt4[d]*=2; std::cout << GridLogMessage <<"\t"; for(int d=0;d & latt4, int Ls, int threads,int report ) Dw.Dhop(src,result,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif if (ncall < 5 ) exit(0); @@ -297,7 +303,11 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) sDw.Dhop(ssrc,sresult,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif PerformanceCounter Counter(8); Counter.Start(); @@ -340,7 +350,9 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) CounterSdw.Start(); t0=usecond(); for(int i=0;i void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, @@ -80,6 +83,8 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd #undef VMOVRDUP #undef MAYBEPERM #undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index bd96b7d5..d3e86276 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,8 +1,7 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - int localc,permc, ptypec; - uint64_t basea, baseb, basec; + uint64_t basea, baseb; uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; @@ -12,22 +11,15 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +47,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basec); + MULT_2SPIN_DIR_PFYP(Yp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -65,16 +56,15 @@ // Zp //////////////////////////////// baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR1,permc); + ZM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); } else { - LOAD_CHI(basec); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFZP(Zp,basea); + MULT_2SPIN_DIR_PFZP(Zp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -82,17 +72,16 @@ //////////////////////////////// // Tp //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR0,perma); + TM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFTP(Tp,baseb); + MULT_2SPIN_DIR_PFTP(Tp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -100,17 +89,16 @@ //////////////////////////////// // Xm //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR3,permb); + XP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR3,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFXM(Xm,basec); + MULT_2SPIN_DIR_PFXM(Xm,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -118,14 +106,13 @@ //////////////////////////////// // Ym //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR2,permc); + YP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); } else { - LOAD_CHI(basec); + LOAD_CHI(baseb); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -136,8 +123,7 @@ //////////////////////////////// // Zm //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -155,7 +141,6 @@ // Tm //////////////////////////////// basea = (uint64_t)&out._odata[ss]; - PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -163,16 +148,16 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basec); + MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - // PREFETCH_CHIMU(basex); - SAVE_RESULT(&out._odata[ss]); - - } + SAVE_RESULT(&out._odata[ss],baseb); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc new file mode 100644 index 00000000..5a3e01f7 --- /dev/null +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc @@ -0,0 +1,187 @@ +{ + int locala,perma, ptypea; + int localb,permb, ptypeb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + uint64_t basex; + const uint64_t plocal =(uint64_t) & in._odata[0]; + + // vComplexF isigns[2] = { signs[0], signs[1] }; + vComplexF *isigns = &signs[0]; + + MASK_REGS; + + for(int site=0;site shuffle and xor the real part sign bit + YM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYP(Yp,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YM_RECON_ACCUM; + + //////////////////////////////// + // Zp + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(ZP) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFZP(Zp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_RECON_ACCUM; + + //////////////////////////////// + // Tp + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(TP) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFTP(Tp,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_RECON_ACCUM; + + //////////////////////////////// + // Xm + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + label(FX(XM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFXM(Xm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_RECON_ACCUM; + + //////////////////////////////// + // Ym + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(YM) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFYM(Ym,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_RECON_ACCUM; + + //////////////////////////////// + // Zm + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(ZM) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZM(Zm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_RECON_ACCUM; + + //////////////////////////////// + // Tm + //////////////////////////////// + basea = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basea); + label(FX(TM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFTM(Tm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_RECON_ACCUM; + + // PREFETCH_CHIMU(basex); + label(FX(SAV) ); + SAVE_RESULT(&out._odata[ss]); + + } + ssU++; + } +} diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a3cd980d..6878bcfb 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -37,6 +37,8 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); +//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); + #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" From a25bec87d981a393d7f03b81254587543ae8d3d6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:54:14 -0700 Subject: [PATCH 15/21] Prefetch during save --- lib/simd/Intel512wilson.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 2bc0545d..207d9db8 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -104,7 +104,7 @@ Author: paboyle #define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) #define SAVE_CHI(PTR) SAVE_CHIi(PTR) -#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR) +#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) #define LOAD_CHIMUi \ LOAD_CHIMU01i \ @@ -169,21 +169,22 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR)\ +#define SAVE_RESULTi(PTR,pf) \ LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ __asm__ ( \ - VSTORE(0,%r8,result_00) \ - VSTORE(1,%r8,result_01) \ - VSTORE(2,%r8,result_02) \ - VSTORE(3,%r8,result_10) \ - VSTORE(4,%r8,result_11) \ - VSTORE(5,%r8,result_12) \ - VSTORE(6,%r8,result_20) \ - VSTORE(7,%r8,result_21) \ - VSTORE(8,%r8,result_22) \ - VSTORE(9,%r8,result_30) \ - VSTORE(10,%r8,result_31) \ - VSTORE(11,%r8,result_32) \ + VSTORE(0,%r8,result_00) VPREFETCHG(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCHG(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCHG(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCHG(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCHG(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCHG(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCHG(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCHG(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCHG(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCHG(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCHG(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCHG(11,%r9) \ ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) From 05c884a62a116033fb426354c5d07b7bdd6c5a82 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:54:50 -0700 Subject: [PATCH 16/21] Prefetch change --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d3e86276..d50999f6 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -2,7 +2,6 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -20,7 +19,6 @@ int ent=ss*8;// 2*Ndim basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - basex = basea; if ( locala ) { LOAD64(%r10,isigns); @@ -38,7 +36,7 @@ //////////////////////////////// // Yp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -55,7 +53,7 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_PROJMEM(basea); @@ -72,7 +70,7 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_PROJMEM(baseb); @@ -89,7 +87,7 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_PROJMEM(basea); @@ -106,7 +104,7 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_PROJMEM(baseb); @@ -123,7 +121,7 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -148,7 +146,7 @@ } else { LOAD_CHI(baseb); } - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { MULT_2SPIN_DIR_PFTM(Tm,basea); } From 1445189361272305c0101081ea5da69300deb3ff Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 25 Jun 2016 12:55:25 -0700 Subject: [PATCH 17/21] COntrol the prefetch strategy --- lib/simd/Intel512common.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index 6878bcfb..a05f978c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -28,6 +28,14 @@ Author: paboyle #ifndef GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Peformance options +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define AVX512_PF_L1 +#undef AVX512_PF_L2_LINEAR +#undef AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_WRITE + //////////////////////////////////////////////////////////////////////////////////////////////////// // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,10 +96,30 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" +#ifdef AVX512_PF_L1 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" +#else +#define VPREFETCHG(O,A) +#endif + +#ifdef AVX512_PF_L2_LINEAR #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" +#else +#define VPREFETCH2(O,A) +#endif + +#ifdef AVX512_PF_L2_TABLE #define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" +#else +#define VPREFETCHP(O,A) +#endif + +#ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" +#else +#define VPREFETCHW(O,A) +#endif + #define VPREFETCHNTA(O,A) #define VPREFETCH(O,A) From 8fcefc021a604431a1de19c05f848c4af022e61a Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 26 Jun 2016 12:54:14 -0700 Subject: [PATCH 18/21] Improved the prefetching when using cache blocking codes --- lib/Stencil.h | 5 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 86 ++++--- .../action/fermion/WilsonKernelsAsmBody.h.ab | 18 +- lib/simd/Intel512common.h | 24 +- lib/simd/Intel512wilson.h | 237 +++++++++++------- 5 files changed, 208 insertions(+), 162 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 8019e3f9..bc015370 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -261,6 +261,9 @@ } }; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); local = _entries[ent]._is_local; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d50999f6..7373d2eb 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,7 +1,9 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - uint64_t basea, baseb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -10,15 +12,22 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -45,7 +55,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basea); + MULT_2SPIN_DIR_PFYP(Yp,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -53,16 +63,17 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; - if ( locala ) { + baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); } else { - LOAD_CHI(basea); + LOAD_CHI(basec); } { - MULT_2SPIN_DIR_PFZP(Zp,baseb); + MULT_2SPIN_DIR_PFZP(Zp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -70,16 +81,17 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; - if ( localb ) { + basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFTP(Tp,basea); + MULT_2SPIN_DIR_PFTP(Tp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -87,16 +99,17 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR3,perma); + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFXM(Xm,baseb); + MULT_2SPIN_DIR_PFXM(Xm,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -104,13 +117,14 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); } else { - LOAD_CHI(baseb); + LOAD_CHI(basec); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -121,7 +135,8 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + basec = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basec); if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -138,7 +153,8 @@ //////////////////////////////// // Tm //////////////////////////////// - basea = (uint64_t)&out._odata[ss]; + // basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + // PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -146,16 +162,16 @@ } else { LOAD_CHI(baseb); } - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basea); + MULT_2SPIN_DIR_PFTM(Tm,basec); } + // baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss],baseb); - - } + SAVE_RESULT(&out._odata[ss],basec); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab index 3ba9eec6..d50999f6 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab @@ -2,7 +2,6 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -19,9 +18,7 @@ //////////////////////////////// int ent=ss*8;// 2*Ndim basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - basex = basea; if ( locala ) { LOAD64(%r10,isigns); @@ -39,7 +36,7 @@ //////////////////////////////// // Yp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +53,7 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_PROJMEM(basea); @@ -73,7 +70,7 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_PROJMEM(baseb); @@ -90,7 +87,7 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_PROJMEM(basea); @@ -107,7 +104,7 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_PROJMEM(baseb); @@ -124,7 +121,7 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -149,13 +146,14 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss]); + SAVE_RESULT(&out._odata[ss],baseb); } ssU++; diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a05f978c..dabbf6d8 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -31,9 +31,6 @@ Author: paboyle //////////////////////////////////////////////////////////////////////////////////////////////////// // Peformance options //////////////////////////////////////////////////////////////////////////////////////////////////// -#define AVX512_PF_L1 -#undef AVX512_PF_L2_LINEAR -#undef AVX512_PF_L2_TABLE #undef AVX512_PF_L2_WRITE //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -45,7 +42,7 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); -//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); +//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" @@ -96,30 +93,13 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" -#ifdef AVX512_PF_L1 -#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" -#else -#define VPREFETCHG(O,A) -#endif - -#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCH2(O,A) -#endif - -#ifdef AVX512_PF_L2_TABLE -#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCHP(O,A) -#endif - #ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" #else #define VPREFETCHW(O,A) #endif - #define VPREFETCHNTA(O,A) #define VPREFETCH(O,A) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 207d9db8..9deffd80 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -169,23 +169,6 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR,pf) \ - LOAD64(%r8,PTR) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSTORE(0,%r8,result_00) VPREFETCHG(0,%r9) \ - VSTORE(1,%r8,result_01) VPREFETCHG(1,%r9) \ - VSTORE(2,%r8,result_02) VPREFETCHG(2,%r9) \ - VSTORE(3,%r8,result_10) VPREFETCHG(3,%r9) \ - VSTORE(4,%r8,result_11) VPREFETCHG(4,%r9) \ - VSTORE(5,%r8,result_12) VPREFETCHG(5,%r9) \ - VSTORE(6,%r8,result_20) VPREFETCHG(6,%r9) \ - VSTORE(7,%r8,result_21) VPREFETCHG(7,%r9) \ - VSTORE(8,%r8,result_22) VPREFETCHG(8,%r9) \ - VSTORE(9,%r8,result_30) VPREFETCHG(9,%r9) \ - VSTORE(10,%r8,result_31) VPREFETCHG(10,%r9) \ - VSTORE(11,%r8,result_32) VPREFETCHG(11,%r9) \ - ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) @@ -560,24 +543,89 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) -/* - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCHG(0,%r9)\ - VPREFETCHG(1,%r9)\ - VPREFETCHG(2,%r9)\ - VPREFETCHG(3,%r9)\ - VPREFETCHG(4,%r9)\ - VPREFETCHG(5,%r9)\ - VPREFETCHG(6,%r9)\ - VPREFETCHG(7,%r9)\ - VPREFETCHG(8,%r9)\ - VPREFETCHG(9,%r9)\ - VPREFETCHG(10,%r9)\ - VPREFETCHG(11,%r9)); -*/ -#define PERMUTE_DIR0 __asm__ ( \ +#define AVX512_PF_L1 +#define AVX512_PF_L2_GAUGE +#define AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_LINEAR + +#ifdef AVX512_PF_L2_TABLE +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) VPREFETCH1(A,B) +#else +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) +#endif +#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH_M1(A,B) +#define VPREFETCH_M2(A,B) +#else +#define VPREFETCH_M1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#endif +#ifdef AVX512_PF_L2_GAUGE +#define VPREFETCH_G1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_G2(A,B) VPREFETCH2(A,B) +#else +#endif + +#define PF_GAUGE(A) \ + LOAD64(%r8,&U._odata[sU](A)) \ + __asm__ ( \ + VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \ + VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ + ); + +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ + ); + +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P2(0,%r9) \ + VPREFETCH_P2(1,%r9) \ + VPREFETCH_P2(2,%r9) \ + VPREFETCH_P2(3,%r9) \ + VPREFETCH_P2(4,%r9) \ + VPREFETCH_P2(5,%r9) \ + VPREFETCH_P2(6,%r9) \ + VPREFETCH_P2(7,%r9) \ + VPREFETCH_P2(8,%r9) \ + VPREFETCH_P2(9,%r9) \ + VPREFETCH_P2(10,%r9) \ + VPREFETCH_P2(11,%r9)); + +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ VPERM0(Chi_02,Chi_02) \ @@ -614,14 +662,15 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ - VPREFETCH2(11,%r8) \ - VPREFETCH2(12,%r8) \ - VPREFETCH2(13,%r8) \ - VPREFETCH2(14,%r8) \ - VPREFETCH2(15,%r8) \ - VPREFETCH2(16,%r8) \ - VPREFETCH2(17,%r8) \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ VSHUF(Chi_00,T1) \ VMOVIDUP(0,%r8,Z0 ) \ VMOVIDUP(3,%r8,Z1 ) \ @@ -633,10 +682,10 @@ Author: paboyle VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*18*/ \ VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ VMADDSUB(Z3,Chi_10,UChi_10) \ @@ -644,10 +693,10 @@ Author: paboyle VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*28*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -674,15 +723,15 @@ Author: paboyle VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCHG(9,%r8) \ - VPREFETCHG(10,%r8) \ - VPREFETCHG(11,%r8) \ - VPREFETCHG(12,%r8) \ - VPREFETCHG(13,%r8) \ - VPREFETCHG(14,%r8) \ - VPREFETCHG(15,%r8) \ - VPREFETCHG(16,%r8) \ - VPREFETCHG(17,%r8) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ /*48*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -690,10 +739,10 @@ Author: paboyle VMADDSUB(Z1,T2,UChi_11) \ VMADDSUB(Z2,T1,UChi_02) \ VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*55*/ \ VMADDSUB(Z3,Chi_02,UChi_00) \ VMADDSUB(Z3,Chi_12,UChi_10) \ @@ -712,56 +761,56 @@ Author: paboyle VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*8*/ \ VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*16*/ \ VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*22*/ \ VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ /*30*/ \ VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - VPREFETCHG(2,%r8) \ - VPREFETCHG(3,%r8) \ - VPREFETCH2(4,%r8) \ - VPREFETCH2(5,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ @@ -794,8 +843,8 @@ Author: paboyle VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCHG(2,%r8)*/ \ - /* VPREFETCHG(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ ); From bdaa5b17670b728cbd65e7ea0a802dc1d9e1bc65 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 30 Jun 2016 13:07:42 -0700 Subject: [PATCH 19/21] Updated to have perfect prefetching for the s-vectorised kernel with any cache blocking. --- lib/Stencil.h | 5 + lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 153 +++++++++--------- lib/simd/Intel512wilson.h | 58 ++++--- 3 files changed, 119 insertions(+), 97 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index bc015370..f5b6c288 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -272,6 +272,11 @@ if (local) return base + _entries[ent]._byte_offset; else return _entries[ent]._byte_offset; } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return _entries[ent]._byte_offset; + } // Comms buffers std::vector > u_simd_send_buf; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 7373d2eb..4f3ef861 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,43 +1,44 @@ { - int locala,perma, ptypea; - int localb,permb, ptypeb; - int localc,permc, ptypec; - uint64_t basea, baseb, basec; - + int local,perm, ptype; + uint64_t base; + uint64_t basep; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; vComplexF *isigns = &signs[0]; MASK_REGS; - + int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; + int sUn=lo.Reorder(ssn); for(int s=0;s shuffle and xor the real part sign bit - YM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYP(Yp,basec); + MULT_2SPIN_DIR_PFYP(Yp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -63,17 +65,18 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR1,permc); + ZM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - LOAD_CHI(basec); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZP(Zp,basea); + MULT_2SPIN_DIR_PFZP(Zp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -81,17 +84,18 @@ //////////////////////////////// // Tp //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++; - PREFETCH_CHIMU(basec); - if ( locala ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR0,perma); + TM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTP(Tp,baseb); + MULT_2SPIN_DIR_PFTP(Tp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -99,17 +103,19 @@ //////////////////////////////// // Xm //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; - PREFETCH_CHIMU(basea); - if ( localb ) { + basep= (uint64_t) &out._odata[ss]; + // basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR3,permb); + XP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR3,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFXM(Xm,basec); + MULT_2SPIN_DIR_PFXM(Xm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -117,17 +123,18 @@ //////////////////////////////// // Ym //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR2,permc); + YP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(basec); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYM(Ym,basea); + MULT_2SPIN_DIR_PFYM(Ym,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_RECON_ACCUM; @@ -135,17 +142,18 @@ //////////////////////////////// // Zm //////////////////////////////// - basec = (uint64_t)&out._odata[ss]; - PREFETCH_CHIMU(basec); - if ( locala ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZM(Zm,baseb); + MULT_2SPIN_DIR_PFZM(Zm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_RECON_ACCUM; @@ -153,23 +161,24 @@ //////////////////////////////// // Tm //////////////////////////////// - // basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - // PREFETCH_CHIMU(basea); - if ( localb ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base= (uint64_t) &out._odata[ss]; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTM(Tm,basec); + MULT_2SPIN_DIR_PFTM(Tm,basep); } - // baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss],basec); + basep= st.GetPFInfo(nent,plocal); nent++; + SAVE_RESULT(base,basep); } ssU++; diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 9deffd80..660d07d6 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -261,8 +261,8 @@ Author: paboyle #define XM_PROJMEM(PTR) \ LOAD64(%r8,PTR)\ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ @@ -290,8 +290,8 @@ Author: paboyle #define ZM_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ @@ -548,24 +548,25 @@ Author: paboyle #define AVX512_PF_L2_TABLE #undef AVX512_PF_L2_LINEAR -#ifdef AVX512_PF_L2_TABLE -#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) -#define VPREFETCH_P2(A,B) VPREFETCH1(A,B) -#else -#define VPREFETCH_P1(A,B) -#define VPREFETCH_P2(A,B) -#endif -#ifdef AVX512_PF_L2_LINEAR -#define VPREFETCH_M1(A,B) +#ifdef AVX512_PF_L2_TABLE +// P1 Fetches the base pointer for next link into L1 with P1 +// M1 Fetches the next site pointer into L2 +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) +#define VPREFETCH_M1(A,B) VPREFETCH2(A,B) #define VPREFETCH_M2(A,B) -#else +#endif + +#ifdef AVX512_PF_L2_LINEAR #define VPREFETCH_M1(A,B) VPREFETCH1(A,B) #define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) #endif + #ifdef AVX512_PF_L2_GAUGE #define VPREFETCH_G1(A,B) VPREFETCH1(A,B) #define VPREFETCH_G2(A,B) VPREFETCH2(A,B) -#else #endif #define PF_GAUGE(A) \ @@ -593,21 +594,26 @@ Author: paboyle VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ ); +#ifdef AVX512_PF_L2_TABLE #define PREFETCH_CHIMU(A) \ LOAD64(%r9,A) \ __asm__ ( \ - VPREFETCH_P2(0,%r9) \ - VPREFETCH_P2(1,%r9) \ - VPREFETCH_P2(2,%r9) \ - VPREFETCH_P2(3,%r9) \ - VPREFETCH_P2(4,%r9) \ - VPREFETCH_P2(5,%r9) \ - VPREFETCH_P2(6,%r9) \ - VPREFETCH_P2(7,%r9) \ - VPREFETCH_P2(8,%r9) \ - VPREFETCH_P2(9,%r9) \ - VPREFETCH_P2(10,%r9) \ - VPREFETCH_P2(11,%r9)); + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#else +#define PREFETCH_CHIMU(A) +#endif #define PREFETCH1_CHIMU(A) \ LOAD64(%r9,A) \ @@ -811,6 +817,8 @@ Author: paboyle VPREFETCH_G1(3,%r8) \ VPREFETCH_G2(4,%r8) \ VPREFETCH_G2(5,%r8) \ + VPREFETCH_G2(6,%r8) \ + VPREFETCH_G2(7,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ From 712b9a348979f41e50f9f8bb7c7838807dbebe14 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 30 Jun 2016 14:00:34 -0700 Subject: [PATCH 20/21] Asm only for avx512 --- lib/qcd/action/fermion/WilsonKernels.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 672c23d6..4edd25f9 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -42,12 +42,15 @@ void WilsonKernels::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,Dou std::vector > &buf, int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 if ( AsmOpt ) { WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); } else { - +#else + { +#endif for(int site=0;site::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); From 3fc6e03ad11881367ca62b2ca85abf0076f47897 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 30 Jun 2016 14:44:09 -0700 Subject: [PATCH 21/21] Version file --- VERSION | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 VERSION diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..c12f9497 --- /dev/null +++ b/VERSION @@ -0,0 +1,4 @@ +Version : 0.5.0 + +- AVX512, AVX2, AVX, SSE good +- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above