From 383ca7d39239d99a90e11df7a73918bea0268927 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 01:27:48 +0100
Subject: [PATCH 1/4] Switch off comms for now until feature/multi-communicator
 is merged

---
 benchmarks/Benchmark_ITT.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 4f16b1de..91524149 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -160,7 +160,7 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
-#if 1
+#if 0
 	      tbytes= Grid.StencilSendToRecvFromBegin(requests,
 						      (void *)&xbuf[dir][0],
 						      xmit_to_rank,

From d9cd4f027336b650f7da0add0f3f380611cda7db Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Wed, 23 Aug 2017 15:07:18 +0100
Subject: [PATCH 2/4] Staggered multinode block cg debugged. Missing global
 sum. Code stalls and resumes on KNL at cambridge. Curious.

CG iterations 23ms each, then 3200 ms pauses. Mean bandwidth reports
as 200MB/s. Comms dominant in the report. However, the time behaviour suggests it
is *bursty*.... Could be swap to disk?
---
 .../iterative/BlockConjugateGradient.h        |  9 ++-
 lib/lattice/Lattice_reduction.h               | 38 ++++++++----
 .../fermion/ImprovedStaggeredFermion5D.cc     | 60 +++++++++++++++++++
 .../fermion/ImprovedStaggeredFermion5D.h      | 10 ++++
 .../solver/Test_staggered_block_cg_unprec.cc  |  8 ++-
 5 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index 9418f63c..d7817c05 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 
   Linop.HermOp(X, AD);
   tmp = B - AD;  
+  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
+  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
+  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
+  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
   D=Q;
 
   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -221,13 +226,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     MatrixTimer.Start();
     Linop.HermOp(D, Z);      
     MatrixTimer.Stop();
+    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
 
     //4. M  = [D^dag Z]^{-1}
     sliceInnerTimer.Start();
     sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
     sliceInnerTimer.Stop();
     m_M       = m_DZ.inverse();
-
+    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
+    
     //5. X  = X + D MC
     m_tmp     = m_M * m_C;
     sliceMaddTimer.Start();
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 38982891..db012c8c 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   }
 };
 
+/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
   int NN    = BlockSolverGrid->_ndimension;
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
   }
   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
+*/
 
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   int Nblock = X._grid->GlobalDimensions()[Orthog];
 
   GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int Nblock = X._grid->GlobalDimensions()[Orthog];
 
   GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl=1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
   typedef typename vobj::vector_type vector_type;
   
   GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
   
   int Nblock = FullGrid->GlobalDimensions()[Orthog];
   
-  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
   
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
       mat += mat_thread;
     }  
   }
+
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+
   return;
 }
 
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
index 61a3c559..7d988d89 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 {
   Compressor compressor;
   int LLs = in._grid->_rdimensions[0];
+
+
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
   st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
   
+  DhopComputeTime -= usecond();
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
   if (dag == DaggerYes) {
     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
     }
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }
 
 
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
   conformable(in._grid,out._grid); // drops the cb check
 
@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
   conformable(in._grid,out._grid); // drops the cb check
 
@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in._grid,FermionGrid()); // verifies full grid
   conformable(in._grid,out._grid);
 
@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
 
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void) 
+{
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
+	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
+	    << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
+	    << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
+{
+  DhopCalls       = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
 
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 4961da49..ca1a955a 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -55,6 +55,16 @@ namespace QCD {
       FermionField _tmp;
       FermionField &tmp(void) { return _tmp; }
 
+      ////////////////////////////////////////
+      // Performance monitoring
+      ////////////////////////////////////////
+      void Report(void);
+      void ZeroCounters(void);
+      double DhopTotalTime;
+      double DhopCalls;
+      double DhopCommTime;
+      double DhopComputeTime;
+
       ///////////////////////////////////////////////////////////////
       // Implement the abstract base
       ///////////////////////////////////////////////////////////////
diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc
index 8db41e98..f54bc3b2 100644
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -75,7 +75,7 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
   RealD mass=0.003;
-  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
+  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); 
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
@@ -99,21 +99,27 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   CG(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   mCG(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   BCGrQ(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 

From 5fa386ddc96fc13146e528ea4d3d92ec9552c49e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 24 Aug 2017 10:17:52 +0100
Subject: [PATCH 3/4] FFT test compile fixed

---
 lib/qcd/utils/GaugeFix.h    | 3 +++
 tests/core/Test_fft_gfix.cc | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h
index f2ea1aa2..c4ea31aa 100644
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 //#include <Grid/Grid.h>
 
+#ifndef GRID_QCD_GAUGE_FIX_H
+#define GRID_QCD_GAUGE_FIX_H
 namespace Grid {
 namespace QCD {
 
@@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 
 }
 }
+#endif
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 9732eb85..916c4b0b 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid/Grid.h>
 
+using namespace Grid;
+using namespace Grid::QCD;
+
 int main (int argc, char ** argv)
 {
   std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)
 
   Uorg = Uorg - Umu;
   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
+  std::cout << " Norm "<< norm2(Umu) << std::endl;
 
 
   std::cout<< "*****************************************************************" <<std::endl;

From 102ea9ae668a1c8eef506113348355e9d78fd522 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 24 Aug 2017 18:17:09 +0100
Subject: [PATCH 4/4] CI update

---
 .travis.yml | 68 -----------------------------------------------------
 README.md   | 16 +------------
 2 files changed, 1 insertion(+), 83 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 64dae823..7d8203ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
       
 before_install:
     - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
diff --git a/README.md b/README.md
index 1e0988f3..13dd6996 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,4 @@
-# Grid
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
 
 **Data parallel C++ mathematical object library.**