Grid/benchmarks/Benchmark_memory_bandwidth.cc

    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

/**
 * Memory-bandwidth benchmark driver.
 *
 * Runs four kernels over lattices of linear size L = 4, 8, ..., lmax per rank
 * (the global lattice is L scaled by the MPI decomposition in each direction):
 *
 *   1. fused  axpy(z,a,x,y)
 *   2. expression-template  z = a*x - y
 *   3. SCALE  z = a*x
 *   4. READ   nn = norm2(x)
 *
 * For every lattice size one line is printed with the byte count of a single
 * kernel application, the derived GB/s and Gflop/s figures and the total
 * wall time in seconds.
 *
 * Timing convention: `time` is (stop-start)/Nloop*1000.  Assuming usecond()
 * returns microseconds (TODO confirm against Grid's timer), this is
 * nanoseconds per iteration, so bytes/time is bytes per nanosecond, i.e. GB/s.
 *
 * @param argc  argument count, forwarded to Grid_init
 * @param argv  argument vector, forwarded to Grid_init
 */
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Nvec=8;
  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  // Largest per-rank linear lattice size that is benchmarked.
  const int lmax=44;

  // Work budget: the iteration count is chosen so that (iterations * volume)
  // stays roughly constant at lmax^4 sites for every lattice size.
  const uint64_t lmax4 = static_cast<uint64_t>(lmax)*lmax*lmax*lmax;

  // Number of timed repetitions for a lattice of `vol` sites.
  // `vol` is the global (weak-scaled) volume, so on multi-rank runs it can
  // exceed lmax^4; the quotient would then truncate to zero, the timed loop
  // would not execute and the per-iteration time would be a division by zero.
  // Always run at least once.
  auto loop_count = [lmax4](uint64_t vol) -> uint64_t {
    const uint64_t n = lmax4/vol;
    return n ? n : 1;
  };

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      // 64-bit volume: the product of four ints overflows 32 bits on large rank counts.
      const uint64_t vol = static_cast<uint64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      const uint64_t Nloop=loop_count(vol);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      double a=2.0;

      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
        axpy(z,a,x,y);
        x._odata[0]=z._odata[0]; // serial loop dependence to prevent optimise
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real); // two operands read, one result written
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

  // NOTE(review): the banner below says "a*x + y" while the timed kernel is
  // z=a*x-y.  The flop and byte counts are the same either way; confirm which
  // one is intended before changing the label or the kernel.
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      const uint64_t vol = static_cast<uint64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      double a=2.0;

      const uint64_t Nloop=loop_count(vol);

      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
        z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;

  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      const uint64_t vol = static_cast<uint64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      const uint64_t Nloop=loop_count(vol);

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      RealD a=2.0;

      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
        z=a*x;
        // Feed the result back into the input; presumably the same
        // anti-optimisation dependency as in the loops above.
        x._odata[0]=z._odata[0]*2.0;
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double bytes=2*vol*Nvec*sizeof(Real); // one operand read, one result written
      double flops=vol*Nvec*1;// mul
      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

  }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      const uint64_t vol = static_cast<uint64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      const uint64_t Nloop=loop_count(vol);
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      Real nn;
      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
        nn=norm2(x);
        // Write the reduction result back into x so each pass depends on the
        // previous one (presumably to stop the reduction being optimised away).
        vsplat(x._odata[0]._internal[0],nn);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double bytes=vol*Nvec*sizeof(Real); // single operand read
      double flops=vol*Nvec*2;// mul,add
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;

  }

  Grid_finalize();
}
Global edit adding copyright and license info to every source file. 2016-01-02 14:51:32 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./benchmarks/Benchmark_memory_bandwidth.cc`

			`Copyright (C) 2015`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`
			`Author: paboyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`#include <Grid.h>`

			`using namespace std;`
			`using namespace Grid;`
			`using namespace Grid::QCD;`

			`int main (int argc, char ** argv)`
			`{`
			`Grid_init(&argc,&argv);`

			`const int Nvec=8;`
			`typedef Lattice< iVector< vReal,Nvec> > LatticeVec;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`typedef iVector<vReal,Nvec> Vec;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00
			`Vec rn = zero;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Enhanced SIMD interfacing 2015-05-12 20:41:44 +01:00			`std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());`
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00			`std::vector<int> mpi_layout = GridDefaultMpi();`

Add messages to get the number of threads for openmp 2015-05-19 14:54:42 +01:00			`int threads = GridThread::GetThreads();`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00
Add messages to get the number of threads for openmp 2015-05-19 14:54:42 +01:00
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`uint64_t lmax=44;`
Better run time on KNC 2015-11-04 11:25:34 +00:00			`#define NLOOP (1*lmax*lmax*lmax*lmax/vol)`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`for(int lat=4;lat<=lmax;lat+=4){`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});`
			`int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`uint64_t Nloop=NLOOP;`

Better run time on KNC 2015-11-04 11:25:34 +00:00			`// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better run time on KNC 2015-11-04 11:25:34 +00:00			`LatticeVec z(&Grid); //random(pRNG,z);`
			`LatticeVec x(&Grid); //random(pRNG,x);`
			`LatticeVec y(&Grid); //random(pRNG,y);`
Updated bandwidth test 2015-05-05 18:08:53 +01:00			`double a=2.0;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00

			`double start=usecond();`
			`for(int i=0;i<Nloop;i++){`
			`axpy(z,a,x,y);`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`x._odata[0]=z._odata[0]; // serial loop dependence to prevent optimise`
			`y._odata[4]=z._odata[4];`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`}`
			`double stop=usecond();`
Cleaned up for Linux 2015-05-05 22:09:22 +01:00			`double time = (stop-start)/Nloop*1000;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`double flops=vol*Nvec*2;// mul,add`
			`double bytes=3*vol*Nvec*sizeof(Real);`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`}`

Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`for(int lat=4;lat<=lmax;lat+=4){`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});`
			`int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

Better run time on KNC 2015-11-04 11:25:34 +00:00			`// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better run time on KNC 2015-11-04 11:25:34 +00:00			`LatticeVec z(&Grid); //random(pRNG,z);`
			`LatticeVec x(&Grid); //random(pRNG,x);`
			`LatticeVec y(&Grid); //random(pRNG,y);`
Updated bandwidth test 2015-05-05 18:08:53 +01:00			`double a=2.0;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`uint64_t Nloop=NLOOP;`

Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`double start=usecond();`
			`for(int i=0;i<Nloop;i++){`
ET ready benchmark with bytes counted assuming loop interchange 2015-05-10 15:18:04 +01:00			`z=a*x-y;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away`
			`y._odata[4]=z._odata[4];`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`}`
			`double stop=usecond();`
Cleaned up for Linux 2015-05-05 22:09:22 +01:00			`double time = (stop-start)/Nloop*1000;`
ET ready benchmark with bytes counted assuming loop interchange 2015-05-10 15:18:04 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`double flops=vol*Nvec*2;// mul,add`
			`double bytes=3*vol*Nvec*sizeof(Real);`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`}`

Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;`

			`for(int lat=4;lat<=lmax;lat+=4){`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00

Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});`
			`int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`uint64_t Nloop=NLOOP;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

Better run time on KNC 2015-11-04 11:25:34 +00:00			`// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better run time on KNC 2015-11-04 11:25:34 +00:00			`LatticeVec z(&Grid); //random(pRNG,z);`
			`LatticeVec x(&Grid); //random(pRNG,x);`
			`LatticeVec y(&Grid); //random(pRNG,y);`
Updated bandwidth test 2015-05-05 18:08:53 +01:00			`RealD a=2.0;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00

			`double start=usecond();`
			`for(int i=0;i<Nloop;i++){`
ET ready benchmark with bytes counted assuming loop interchange 2015-05-10 15:18:04 +01:00			`z=a*x;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`x._odata[0]=z._odata[0]*2.0;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`}`
			`double stop=usecond();`
Cleaned up for Linux 2015-05-05 22:09:22 +01:00			`double time = (stop-start)/Nloop*1000;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`double bytes=2*vol*Nvec*sizeof(Real);`
			`double flops=vol*Nvec*1;// mul`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`}`

Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`for(int lat=4;lat<=lmax;lat+=4){`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});`
			`int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`uint64_t Nloop=NLOOP;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

Better run time on KNC 2015-11-04 11:25:34 +00:00			`// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();`
			`LatticeVec z(&Grid); //random(pRNG,z);`
			`LatticeVec x(&Grid); //random(pRNG,x);`
			`LatticeVec y(&Grid); //random(pRNG,y);`
Updated bandwidth test 2015-05-05 18:08:53 +01:00			`RealD a=2.0;`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`Real nn;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`double start=usecond();`
			`for(int i=0;i<Nloop;i++){`
			`nn=norm2(x);`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`vsplat(x._odata[0]._internal[0],nn);`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`}`
			`double stop=usecond();`
Cleaned up for Linux 2015-05-05 22:09:22 +01:00			`double time = (stop-start)/Nloop*1000;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Weak scale the benchmarks automatically. 2015-05-28 13:47:01 +01:00			`double bytes=vol*Nvec*sizeof(Real);`
			`double flops=vol*Nvec*2;// mul,add`
Better timing tweaks to give sensible results on 24 threads on Edison dual ivybridge nodes. 2015-09-29 00:09:04 +01:00			`std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`}`

			`Grid_finalize();`
			`}`