From 383ca7d39239d99a90e11df7a73918bea0268927 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 01:27:48 +0100 Subject: [PATCH 1/4] Switch off comms for now until feature/multi-communicator is merged --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 4f16b1de..91524149 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -160,7 +160,7 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } -#if 1 +#if 0 tbytes= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[dir][0], xmit_to_rank, From d9cd4f027336b650f7da0add0f3f380611cda7db Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 23 Aug 2017 15:07:18 +0100 Subject: [PATCH 2/4] Staggered multinode block cg debugged. Missing global sum. Code stalls and resumes on KNL at cambridge. Curious. CG iterations 23ms each, then 3200 ms pauses. Mean bandwidth reports as 200MB/s. Comms dominant in the report. However, the time behaviour suggests it is *bursty*.... Could be swap to disk? --- .../iterative/BlockConjugateGradient.h | 9 ++- lib/lattice/Lattice_reduction.h | 38 ++++++++---- .../fermion/ImprovedStaggeredFermion5D.cc | 60 +++++++++++++++++++ .../fermion/ImprovedStaggeredFermion5D.h | 10 ++++ .../solver/Test_staggered_block_cg_unprec.cc | 8 ++- 5 files changed, 110 insertions(+), 15 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 9418f63c..d7817c05 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) Linop.HermOp(X, AD); tmp = B - AD; + //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl; ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); + //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl; + //std::cout << GridLogMessage << " m_rr " << m_rr< &Linop, const Field &B, Field &X) MatrixTimer.Start(); Linop.HermOp(D, Z); MatrixTimer.Stop(); + //std::cout << GridLogMessage << " norm2 Z " < &R,std::vector &a,const Lattice } }; +/* inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) { int NN = BlockSolverGrid->_ndimension; @@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or } return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } +*/ template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) @@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); + // Lattice Xslice(SliceGrid); + // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // Lattice Xslice(SliceGrid); + // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl=1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_type vector_type; GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); int Nblock = FullGrid->GlobalDimensions()[Orthog]; - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); + // Lattice Lslice(SliceGrid); + // Lattice Rslice(SliceGrid); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -550,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat += mat_thread; } } + + for(int i=0;iGlobalSum(sum); + mat(i,j)=sum; + }} + return; } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 61a3c559..7d988d89 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr { Compressor compressor; int LLs = in._grid->_rdimensions[0]; + + + + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { @@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); } } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); } template void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=2; conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,out._grid); @@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); } +template +void ImprovedStaggeredFermion5D::Report(void) +{ + std::vector latt = GridDefaultLatt(); + RealD volume = Ls; for(int mu=0;mu_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _FourDimGrid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" < +void ImprovedStaggeredFermion5D::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} ///////////////////////////////////////////////////////////////////////// // Implement the general interface. Here we use SAME mass on all slices diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 4961da49..ca1a955a 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -55,6 +55,16 @@ namespace QCD { FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 8db41e98..f54bc3b2 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -75,7 +75,7 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -99,21 +99,27 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Calling 5d CG for "< Date: Thu, 24 Aug 2017 10:17:52 +0100 Subject: [PATCH 3/4] FFT test compile fixed --- lib/qcd/utils/GaugeFix.h | 3 +++ tests/core/Test_fft_gfix.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h index f2ea1aa2..c4ea31aa 100644 --- a/lib/qcd/utils/GaugeFix.h +++ b/lib/qcd/utils/GaugeFix.h @@ -26,6 +26,8 @@ Author: Peter Boyle /* END LEGAL */ //#include +#ifndef GRID_QCD_GAUGE_FIX_H +#define GRID_QCD_GAUGE_FIX_H namespace Grid { namespace QCD { @@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer : public Gimpl { } } +#endif diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 9732eb85..916c4b0b 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -28,6 +28,9 @@ Author: Peter Boyle /* END LEGAL */ #include +using namespace Grid; +using namespace Grid::QCD; + int main (int argc, char ** argv) { std::vector seeds({1,2,3,4}); @@ -82,6 +85,7 @@ int main (int argc, char ** argv) Uorg = Uorg - Umu; std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; + std::cout << " Norm "<< norm2(Umu) << std::endl; std::cout<< "*****************************************************************" < Date: Thu, 24 Aug 2017 18:17:09 +0100 Subject: [PATCH 4/4] CI update --- .travis.yml | 68 ----------------------------------------------------- README.md | 16 +------------ 2 files changed, 1 insertion(+), 83 deletions(-) diff --git a/.travis.yml b/.travis.yml index 64dae823..7d8203ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,68 +9,6 @@ matrix: - os: osx osx_image: xcode8.3 compiler: clang - - compiler: gcc - dist: trusty - sudo: required - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.9 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: VERSION=-4.9 - - compiler: gcc - dist: trusty - sudo: required - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-5 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: VERSION=-5 - - compiler: clang - dist: trusty - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.8 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz - - compiler: clang - dist: trusty - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.8 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz before_install: - export GRIDDIR=`pwd` @@ -106,9 +44,3 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check - - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi - - diff --git a/README.md b/README.md index 1e0988f3..13dd6996 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,4 @@ -# Grid - - - - - - - - - -
Last stable release - -
Development branch - -
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) **Data parallel C++ mathematical object library.**