Merge branch 'develop' into feature/multi-communicator

2025-11-07 07:09:32 +00:00 · 2017-08-25 20:21:26 +01:00
parent f68b5de9c8 102ea9ae66
commit 80c5bce5bb
10 changed files with 118 additions and 99 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
      
 before_install:
    - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
--- a/README.md
+++ b/README.md
@@ -1,18 +1,4 @@
-# Grid
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)

 **Data parallel C++ mathematical object library.**

--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -181,7 +181,6 @@ public:
 	  
 #ifdef GRID_OMP
 #pragma omp atomic
-#endif
 	      ncomm++;

 #ifdef GRID_OMP
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)

  Linop.HermOp(X, AD);
  tmp = B - AD;  
+  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
+  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
+  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
+  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;

  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -221,12 +226,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
+    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;

    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
+    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };

+/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
+*/

 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int Nblock = X._grid->GlobalDimensions()[Orthog];

  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);

-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  int Nblock = X._grid->GlobalDimensions()[Orthog];

  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl=1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  typedef typename vobj::vector_type vector_type;
  
  GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
  
-  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
  
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
      mat += mat_thread;
    }  
  }
+
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+
  return;
 }

--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 {
  Compressor compressor;
  int LLs = in._grid->_rdimensions[0];
-  st.HaloExchange(in,compressor);

+
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+  
+  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  if (dag == DaggerYes) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
    }
  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }


 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);

@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void) 
+{
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
+	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
+	    << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
+	    << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
+{
+  DhopCalls       = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}

 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -55,6 +55,16 @@ namespace QCD {
      FermionField _tmp;
      FermionField &tmp(void) { return _tmp; }

+      ////////////////////////////////////////
+      // Performance monitoring
+      ////////////////////////////////////////
+      void Report(void);
+      void ZeroCounters(void);
+      double DhopTotalTime;
+      double DhopCalls;
+      double DhopCommTime;
+      double DhopComputeTime;
+
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 //#include <Grid/Grid.h>

+#ifndef GRID_QCD_GAUGE_FIX_H
+#define GRID_QCD_GAUGE_FIX_H
 namespace Grid {
 namespace QCD {

@@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

 }
 }
+#endif
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid/Grid.h>

+using namespace Grid;
+using namespace Grid::QCD;
+
 int main (int argc, char ** argv)
 {
  std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)

  Uorg = Uorg - Umu;
  std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
+  std::cout << " Norm "<< norm2(Umu) << std::endl;


  std::cout<< "*****************************************************************" <<std::endl;
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -99,21 +99,27 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
+  Ds.ZeroCounters();
  CG(HermOp,src,result);
+  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
+  Ds.ZeroCounters();
  mCG(HermOp,src,result);
+  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
+  Ds.ZeroCounters();
  BCGrQ(HermOp,src,result);
+  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;