Grid/benchmarks/Benchmark_comms.cc

    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_comms.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

// Summary statistics over a set of timing samples.
//
//   mean : arithmetic mean of the samples
//   err  : standard error of the mean, sqrt( sum (x-mean)^2 / (N*(N-1)) )
//   min  : smallest sample
//   max  : largest sample
//
// The members are not initialised on construction; they only hold meaningful
// values after statistics() has been called.
struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  // Recomputes mean/err/min/max from the samples in v.
  //
  // Edge cases:
  //   - empty v        : all four members are set to 0 (there is nothing to
  //                      measure, and dereferencing minmax_element's end
  //                      iterator would be undefined behaviour);
  //   - a single sample: err is set to 0, because the N*(N-1) denominator
  //                      vanishes and would otherwise produce NaN.
  void statistics(const std::vector<double> &v){
      const std::size_t n = v.size();
      if (n == 0) {
        mean = 0.0;
        err  = 0.0;
        min  = 0.0;
        max  = 0.0;
        return;
      }

      const double sum = std::accumulate(v.begin(), v.end(), 0.0);
      const double avg = sum / n;

      // Sum of squared deviations from the mean, accumulated in sample order.
      double sq_sum = 0.0;
      for (std::size_t i = 0; i < n; i++) {
        const double d = v[i] - avg;
        sq_sum += d * d;
      }

      mean = avg;
      err  = (n > 1) ? std::sqrt(sq_sum / (n * (n - 1))) : 0.0;

      auto result = std::minmax_element(v.begin(), v.end());
      min = *result.first;
      max = *result.second;
  }
};

// Writes the column-title row that precedes each benchmark table:
// lattice extent, Ls, message size in bytes, then the uni- and bi-directional
// bandwidth columns.
void header(){
  // Every insertion goes through whatever stream the logger hands back, exactly
  // as it would in a single chained expression.
  auto &&out = std::cout << GridLogMessage;

  out << " L  " << "\t" << " Ls  " << "\t";
  out << std::setw(11) << "bytes";
  out << "MB/s uni (err/min/max)" << "\t\t" << "MB/s bidi (err/min/max)";
  out << std::endl;
}

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  int Nloop=500;
  int nmu=0;
  int maxlat=24;
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
  time_statistics timestat;

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      for(int i=0;i<Nloop;i++){
      double start=usecond();

	std::vector<CartesianCommunicator::CommsRequest_t> requests;

	ncomm=0;
	for(int mu=0;mu<4;mu++){
	
	  if (mpi_layout[mu]>1 ) {
	  
	    ncomm++;
	    int comm_proc=1;
	    int xmit_to_rank;
	    int recv_from_rank;
	    
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.SendToRecvFromBegin(requests,
				   (void *)&xbuf[mu][0],
				   xmit_to_rank,
				   (void *)&rbuf[mu][0],
				   recv_from_rank,
				   bytes);
	
	    comm_proc = mpi_layout[mu]-1;
	  
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.SendToRecvFromBegin(requests,
				     (void *)&xbuf[mu+4][0],
				     xmit_to_rank,
				     (void *)&rbuf[mu+4][0],
				     recv_from_rank,
				     bytes);
	  
	  }
	}
	Grid.SendToRecvFromComplete(requests);
	Grid.Barrier();
  double stop=usecond();
  t_time[i] = stop-start; // microseconds
      }

      timestat.statistics(t_time);

      double dbytes    = bytes;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

    }
  }    


  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));


      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      for(int i=0;i<Nloop;i++){
      double start=usecond();
    
	ncomm=0;
	for(int mu=0;mu<4;mu++){
	
	  if (mpi_layout[mu]>1 ) {
	  
	    ncomm++;
	    int comm_proc=1;
	    int xmit_to_rank;
	    int recv_from_rank;
	    
	    {
	      std::vector<CartesianCommunicator::CommsRequest_t> requests;
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	      Grid.SendToRecvFromBegin(requests,
				       (void *)&xbuf[mu][0],
				       xmit_to_rank,
				       (void *)&rbuf[mu][0],
				       recv_from_rank,
				       bytes);
	      Grid.SendToRecvFromComplete(requests);
	    }

	    comm_proc = mpi_layout[mu]-1;
	    {
	      std::vector<CartesianCommunicator::CommsRequest_t> requests;
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	      Grid.SendToRecvFromBegin(requests,
				       (void *)&xbuf[mu+4][0],
				       xmit_to_rank,
				       (void *)&rbuf[mu+4][0],
				       recv_from_rank,
				       bytes);
	      Grid.SendToRecvFromComplete(requests);
	    }
	  }
	}
	Grid.Barrier();
      double stop=usecond();
    t_time[i] = stop-start; // microseconds

      }

      timestat.statistics(t_time);
      
      double dbytes    = bytes;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

    std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

      
    }
  }  


  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      for(int i=0;i<Nloop;i++){
      double start=usecond();

	std::vector<CartesianCommunicator::CommsRequest_t> requests;

	ncomm=0;
	for(int mu=0;mu<4;mu++){
	
	  if (mpi_layout[mu]>1 ) {
	  
	    ncomm++;
	    int comm_proc=1;
	    int xmit_to_rank;
	    int recv_from_rank;
	    
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu][0],
					    recv_from_rank,
					    bytes);
	
	    comm_proc = mpi_layout[mu]-1;
	  
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu+4][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu+4][0],
					    recv_from_rank,
					    bytes);
	  
	  }
	}
	Grid.StencilSendToRecvFromComplete(requests);
	Grid.Barrier();
      double stop=usecond();
    t_time[i] = stop-start; // microseconds

      }

      timestat.statistics(t_time);

      double dbytes    = bytes;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;


    }
  }    


  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      for(int i=0;i<Nloop;i++){
      double start=usecond();

	std::vector<CartesianCommunicator::CommsRequest_t> requests;

	ncomm=0;
	for(int mu=0;mu<4;mu++){
	
	  if (mpi_layout[mu]>1 ) {
	  
	    ncomm++;
	    int comm_proc=1;
	    int xmit_to_rank;
	    int recv_from_rank;
	    
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu][0],
					    recv_from_rank,
					    bytes);
	    Grid.StencilSendToRecvFromComplete(requests);
	    requests.resize(0);

	    comm_proc = mpi_layout[mu]-1;
	  
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu+4][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu+4][0],
					    recv_from_rank,
					    bytes);
	    Grid.StencilSendToRecvFromComplete(requests);
	    requests.resize(0);
	  
	  }
	}
	    Grid.Barrier();
      double stop=usecond();
      t_time[i] = stop-start; // microseconds

      }

      timestat.statistics(t_time);

      double dbytes    = bytes;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;


      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
    }
  }    

  Grid_finalize();
}
Global edit adding copyright and license info to every source file. 2016-01-02 14:51:32 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./benchmarks/Benchmark_comms.cc`

			`Copyright (C) 2015`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
Open up dependency on Eigen and FFTW 2016-07-07 22:31:07 +01:00			`#include <Grid/Grid.h>`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
			`using namespace std;`
			`using namespace Grid;`
			`using namespace Grid::QCD;`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`struct time_statistics{`
			`double mean;`
			`double err;`
			`double min;`
			`double max;`

			`void statistics(std::vector<double> v){`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double sum = std::accumulate(v.begin(), v.end(), 0.0);`
			`mean = sum / v.size();`

			`std::vector<double> diff(v.size());`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));`

			`auto result = std::minmax_element(v.begin(), v.end());`
			`min = *result.first;`
			`max = *result.second;`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`}`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`};`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`void header(){`
			`std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"`
			`<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;`
			`};`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`int main (int argc, char ** argv)`
			`{`
			`Grid_init(&argc,&argv);`

Enhanced SIMD interfacing 2015-05-12 20:41:44 +01:00			`std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());`
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00			`std::vector<int> mpi_layout = GridDefaultMpi();`
Add messages to get the number of threads for openmp 2015-05-19 14:54:42 +01:00			`int threads = GridThread::GetThreads();`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`int Nloop=500;`
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`int nmu=0;`
Fixing an allocation issue in Benchmark_comms 2017-05-18 14:44:56 +01:00			`int maxlat=24;`
Comms improvements 2016-11-01 11:35:43 +00:00			`for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;`

Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;`
			`std::vector<double> t_time(Nloop);`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`time_statistics timestat;`
Fixing an allocation issue in Benchmark_comms 2017-05-18 14:44:56 +01:00
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`header();`
Comms benchmark improvements 2017-02-07 06:07:39 +00:00			`for(int lat=4;lat<=maxlat;lat+=4){`
			`for(int Ls=8;Ls<=32;Ls*=2){`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
Update Benchmark_comms.cc 2015-06-25 10:59:53 +01:00			`std::vector<int> latt_size ({lat*mpi_layout[0],`
			`lat*mpi_layout[1],`
			`lat*mpi_layout[2],`
			`lat*mpi_layout[3]});`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

			`std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(latlatlat*Ls));`
			`std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(latlatlat*Ls));`

			`int ncomm;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`int bytes=latlatlatLssizeof(HalfSpinColourVectorD);`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`for(int i=0;i<Nloop;i++){`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double start=usecond();`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
			`std::vector<CartesianCommunicator::CommsRequest_t> requests;`

			`ncomm=0;`
			`for(int mu=0;mu<4;mu++){`

			`if (mpi_layout[mu]>1 ) {`

			`ncomm++;`
			`int comm_proc=1;`
			`int xmit_to_rank;`
			`int recv_from_rank;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.SendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu][0],`
			`recv_from_rank,`
			`bytes);`

			`comm_proc = mpi_layout[mu]-1;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.SendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu+4][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu+4][0],`
			`recv_from_rank,`
			`bytes);`

			`}`
			`}`
			`Grid.SendToRecvFromComplete(requests);`
			`Grid.Barrier();`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double stop=usecond();`
			`t_time[i] = stop-start; // microseconds`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`}`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`timestat.statistics(t_time);`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
Integer wrap problem fixed. 2015-05-29 14:11:34 +01:00			`double dbytes = bytes;`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double xbytes = dbytes2.0ncomm;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`double rbytes = xbytes;`
			`double bidibytes = xbytes+rbytes;`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"`
			`<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)`
			`<<std::right<< xbytes/timestat.mean<<" "<< xbytestimestat.err/(timestat.meantimestat.mean)<< " "`
			`<<xbytes/timestat.max <<" "<< xbytes/timestat.min`
			`<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytestimestat.err/(timestat.meantimestat.mean) << " "`
			`<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
			`}`
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`}`


Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`header();`
Added a comms benchmark 2015-05-02 23:51:43 +01:00
Comms benchmark improvements 2017-02-07 06:07:39 +00:00			`for(int lat=4;lat<=maxlat;lat+=4){`
			`for(int Ls=8;Ls<=32;Ls*=2){`
Added a comms benchmark 2015-05-02 23:51:43 +01:00
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`std::vector<int> latt_size ({lat,lat,lat,lat});`

			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

			`std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(latlatlat*Ls));`
			`std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(latlatlat*Ls));`


			`int ncomm;`
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`int bytes=latlatlatLssizeof(HalfSpinColourVectorD);`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`for(int i=0;i<Nloop;i++){`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double start=usecond();`
Added a comms benchmark 2015-05-02 23:42:30 +01:00
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`ncomm=0;`
			`for(int mu=0;mu<4;mu++){`

			`if (mpi_layout[mu]>1 ) {`

			`ncomm++;`
			`int comm_proc=1;`
			`int xmit_to_rank;`
			`int recv_from_rank;`

			`{`
			`std::vector<CartesianCommunicator::CommsRequest_t> requests;`
			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.SendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu][0],`
			`recv_from_rank,`
			`bytes);`
			`Grid.SendToRecvFromComplete(requests);`
			`}`

			`comm_proc = mpi_layout[mu]-1;`
			`{`
			`std::vector<CartesianCommunicator::CommsRequest_t> requests;`
			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.SendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu+4][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu+4][0],`
			`recv_from_rank,`
			`bytes);`
			`Grid.SendToRecvFromComplete(requests);`
			`}`
			`}`
			`}`
			`Grid.Barrier();`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double stop=usecond();`
			`t_time[i] = stop-start; // microseconds`

Added a comms benchmark 2015-05-02 23:51:43 +01:00			`}`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`timestat.statistics(t_time);`
Integer wrap problem fixed. 2015-05-29 14:11:34 +01:00
			`double dbytes = bytes;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`double xbytes = dbytes2.0ncomm;`
Added a comms benchmark 2015-05-02 23:51:43 +01:00			`double rbytes = xbytes;`
			`double bidibytes = xbytes+rbytes;`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"`
			`<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)`
			`<<std::right<< xbytes/timestat.mean<<" "<< xbytestimestat.err/(timestat.meantimestat.mean)<< " "`
			`<<xbytes/timestat.max <<" "<< xbytes/timestat.min`
			`<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytestimestat.err/(timestat.meantimestat.mean) << " "`
			`<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;`


Added a comms benchmark 2015-05-02 23:51:43 +01:00			`}`
			`}`

Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
Comms improvements 2016-11-01 11:35:43 +00:00			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`header();`
Comms improvements 2016-11-01 11:35:43 +00:00
Comms benchmark improvements 2017-02-07 06:07:39 +00:00			`for(int lat=4;lat<=maxlat;lat+=4){`
			`for(int Ls=8;Ls<=32;Ls*=2){`
Comms improvements 2016-11-01 11:35:43 +00:00
			`std::vector<int> latt_size ({lat*mpi_layout[0],`
			`lat*mpi_layout[1],`
			`lat*mpi_layout[2],`
			`lat*mpi_layout[3]});`

			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

			`std::vector<HalfSpinColourVectorD *> xbuf(8);`
			`std::vector<HalfSpinColourVectorD *> rbuf(8);`
			`Grid.ShmBufferFreeAll();`
			`for(int d=0;d<8;d++){`
			`xbuf[d] = (HalfSpinColourVectorD )Grid.ShmBufferMalloc(latlatlatLs*sizeof(HalfSpinColourVectorD));`
			`rbuf[d] = (HalfSpinColourVectorD )Grid.ShmBufferMalloc(latlatlatLs*sizeof(HalfSpinColourVectorD));`
			`}`

			`int ncomm;`
			`int bytes=latlatlatLssizeof(HalfSpinColourVectorD);`

			`for(int i=0;i<Nloop;i++){`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double start=usecond();`
Comms improvements 2016-11-01 11:35:43 +00:00
			`std::vector<CartesianCommunicator::CommsRequest_t> requests;`

			`ncomm=0;`
			`for(int mu=0;mu<4;mu++){`

			`if (mpi_layout[mu]>1 ) {`

			`ncomm++;`
			`int comm_proc=1;`
			`int xmit_to_rank;`
			`int recv_from_rank;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.StencilSendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu][0],`
			`recv_from_rank,`
			`bytes);`

			`comm_proc = mpi_layout[mu]-1;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.StencilSendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu+4][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu+4][0],`
			`recv_from_rank,`
			`bytes);`

			`}`
			`}`
			`Grid.StencilSendToRecvFromComplete(requests);`
			`Grid.Barrier();`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double stop=usecond();`
			`t_time[i] = stop-start; // microseconds`
Comms improvements 2016-11-01 11:35:43 +00:00
			`}`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`timestat.statistics(t_time);`

Comms improvements 2016-11-01 11:35:43 +00:00			`double dbytes = bytes;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`double xbytes = dbytes2.0ncomm;`
Comms improvements 2016-11-01 11:35:43 +00:00			`double rbytes = xbytes;`
			`double bidibytes = xbytes+rbytes;`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"`
			`<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)`
			`<<std::right<< xbytes/timestat.mean<<" "<< xbytestimestat.err/(timestat.meantimestat.mean)<< " "`
			`<<xbytes/timestat.max <<" "<< xbytes/timestat.min`
			`<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytestimestat.err/(timestat.meantimestat.mean) << " "`
			`<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;`


Comms improvements 2016-11-01 11:35:43 +00:00			`}`
			`}`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00

			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
			`std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;`
			`std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`header();`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
Comms benchmark improvements 2017-02-07 06:07:39 +00:00			`for(int lat=4;lat<=maxlat;lat+=4){`
			`for(int Ls=8;Ls<=32;Ls*=2){`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
			`std::vector<int> latt_size ({lat*mpi_layout[0],`
			`lat*mpi_layout[1],`
			`lat*mpi_layout[2],`
			`lat*mpi_layout[3]});`

			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`

			`std::vector<HalfSpinColourVectorD *> xbuf(8);`
			`std::vector<HalfSpinColourVectorD *> rbuf(8);`
			`Grid.ShmBufferFreeAll();`
			`for(int d=0;d<8;d++){`
			`xbuf[d] = (HalfSpinColourVectorD )Grid.ShmBufferMalloc(latlatlatLs*sizeof(HalfSpinColourVectorD));`
			`rbuf[d] = (HalfSpinColourVectorD )Grid.ShmBufferMalloc(latlatlatLs*sizeof(HalfSpinColourVectorD));`
			`}`

			`int ncomm;`
			`int bytes=latlatlatLssizeof(HalfSpinColourVectorD);`

			`for(int i=0;i<Nloop;i++){`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`double start=usecond();`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
			`std::vector<CartesianCommunicator::CommsRequest_t> requests;`

			`ncomm=0;`
			`for(int mu=0;mu<4;mu++){`

			`if (mpi_layout[mu]>1 ) {`

			`ncomm++;`
			`int comm_proc=1;`
			`int xmit_to_rank;`
			`int recv_from_rank;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.StencilSendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu][0],`
			`recv_from_rank,`
			`bytes);`
Comms benchmark improvements 2017-02-07 06:07:39 +00:00			`Grid.StencilSendToRecvFromComplete(requests);`
			`requests.resize(0);`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
			`comm_proc = mpi_layout[mu]-1;`

			`Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);`
			`Grid.StencilSendToRecvFromBegin(requests,`
			`(void *)&xbuf[mu+4][0],`
			`xmit_to_rank,`
			`(void *)&rbuf[mu+4][0],`
			`recv_from_rank,`
			`bytes);`
			`Grid.StencilSendToRecvFromComplete(requests);`
			`requests.resize(0);`

			`}`
			`}`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00			`Grid.Barrier();`
			`double stop=usecond();`
			`t_time[i] = stop-start; // microseconds`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00
			`}`

Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`timestat.statistics(t_time);`

Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00			`double dbytes = bytes;`
Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`double xbytes = dbytes2.0ncomm;`
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00			`double rbytes = xbytes;`
			`double bidibytes = xbytes+rbytes;`


Adding more statistics to the Benchmark_comms. Min and max 2017-05-19 10:55:04 +01:00			`std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"`
			`<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)`
			`<<std::right<< xbytes/timestat.mean<<" "<< xbytestimestat.err/(timestat.meantimestat.mean)<< " "`
			`<<xbytes/timestat.max <<" "<< xbytes/timestat.min`
			`<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytestimestat.err/(timestat.meantimestat.mean) << " "`
			`<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;`
Adding more statistical info in the Benchmark_comms 2017-05-18 19:07:35 +01:00
Final sign off commits from Cori-1 2016-11-09 12:11:03 +00:00			`}`
			`}`

Added a comms benchmark 2015-05-02 23:42:30 +01:00			`Grid_finalize();`
			`}`