Grid/benchmarks/Benchmark_prec_change.cc

/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_prec_change.cc

    Copyright (C) 2015

Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int Ls = 12;
  Coordinate latt4 = GridDefaultLatt();

  GridCartesian         * UGridD   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
  GridCartesian         * FGridD   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);

  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);

  
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  
  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
  GridParallelRNG          RNG4(UGridD);  RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
  GridParallelRNG          RNG5(FGridD);  RNG5.SeedFixedIntegers(seeds5);
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

  LatticeFermionD field_d(FGridD), tmp_d(FGridD);
  random(RNG5,field_d); tmp_d = field_d;

  LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
  precisionChange(field_d2, field_d); tmp_d2 = field_d2;

  LatticeFermionF field_f(FGridF), tmp_f(FGridF);
  precisionChange(field_f, field_d); tmp_f = field_f;

  int N = 500;

  double time_ds = 0, time_sd = 0;

  std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
  for(int i=0;i<N;i++){
    //We want to benchmark the typical scenario of both fields being device resident
    //To do this, invoke an operation that will open a device view and touch all sites
    //with a write operation that invalidates the CPU copy
    field_d = tmp_d;
    field_f = tmp_f;

    double start=usecond();
    precisionChangeOrig(field_d,field_f);
    double stop=usecond();
    time_sd += stop - start;

    field_d = tmp_d;
    field_f = tmp_f;

    start=usecond();
    precisionChangeOrig(field_f,field_d);
    stop=usecond();
    time_ds += stop - start;   
  }
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;


  precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
  precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
  
  std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
  time_sd = time_ds = 0;
  for(int i=0;i<N;i++){
    field_d = tmp_d;
    field_f = tmp_f;

    double start=usecond();
    precisionChange(field_d,field_f, wk_sp_to_dp);
    double stop=usecond();
    time_sd += stop - start;

    field_d = tmp_d;
    field_f = tmp_f;

    start=usecond();
    precisionChange(field_f,field_d, wk_dp_to_sp);
    stop=usecond();
    time_ds += stop - start;   
  }
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
  
  std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
  time_sd = time_ds = 0;
  for(int i=0;i<N;i++){
    field_d = tmp_d;
    field_f = tmp_f;

    double start=usecond();
    precisionChange(field_d,field_f);
    double stop=usecond();
    time_sd += stop - start;

    field_d = tmp_d;
    field_f = tmp_f;

    start=usecond();
    precisionChange(field_f,field_d);
    stop=usecond();
    time_ds += stop - start;

  }
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;


  std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
  time_sd = time_ds = 0;
  for(int i=0;i<N;i++){
    field_d2 = tmp_d2;
    field_f = tmp_f;

    double start=usecond();
    precisionChangeFast(field_d2,field_f);
    double stop=usecond();
    time_sd += stop - start;

    field_d2 = tmp_d2;
    field_f = tmp_f;

    start=usecond();
    precisionChangeFast(field_f,field_d2);
    stop=usecond();
    time_ds += stop - start;
  }
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;


  std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
  time_sd = time_ds = 0;
  for(int i=0;i<N;i++){
    field_d2 = tmp_d2;
    field_f = tmp_f;

    double start=usecond();
    precisionChange(field_d2,field_f);
    double stop=usecond();
    time_sd += stop - start;

    field_d2 = tmp_d2;
    field_f = tmp_f;

    start=usecond();
    precisionChange(field_f,field_d2);
    stop=usecond();
    time_ds += stop - start;
  }
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;

  Grid_finalize();
}
Precision change improvements Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation. In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes. Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces Renamed the original precisionChange as precisionChangeOrig Fixed incorrect pointer offset bug in copyLane Added a test and a benchmark for precisionChange Added a test for reliable update CG 2023-02-21 15:52:42 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./benchmarks/Benchmark_prec_change.cc`

			`Copyright (C) 2015`

			`Author: Christopher Kelly <ckelly@bnl.gov>`
			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
			`#include <Grid/Grid.h>`

			`using namespace std;`
			`using namespace Grid;`

			`int main (int argc, char ** argv)`
			`{`
			`Grid_init(&argc,&argv);`

			`int Ls = 12;`
			`Coordinate latt4 = GridDefaultLatt();`

			`GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());`
			`GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);`
			`GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);`
			`GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);`

			`GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());`
			`GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);`
			`GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);`
			`GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);`


			`std::vector<int> seeds4({1,2,3,4});`
			`std::vector<int> seeds5({5,6,7,8});`

			`std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;`
			`GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);`
			`std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;`
			`GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);`
			`std::cout << GridLogMessage << "Initialised RNGs" << std::endl;`

			`LatticeFermionD field_d(FGridD), tmp_d(FGridD);`
			`random(RNG5,field_d); tmp_d = field_d;`

			`LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);`
			`precisionChange(field_d2, field_d); tmp_d2 = field_d2;`

			`LatticeFermionF field_f(FGridF), tmp_f(FGridF);`
			`precisionChange(field_f, field_d); tmp_f = field_f;`

			`int N = 500;`

			`double time_ds = 0, time_sd = 0;`

			`std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;`
			`for(int i=0;i<N;i++){`
			`//We want to benchmark the typical scenario of both fields being device resident`
			`//To do this, invoke an operation that will open a device view and touch all sites`
			`//with a write operation that invalidates the CPU copy`
			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`double start=usecond();`
			`precisionChangeOrig(field_d,field_f);`
			`double stop=usecond();`
			`time_sd += stop - start;`

			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`start=usecond();`
			`precisionChangeOrig(field_f,field_d);`
			`stop=usecond();`
			`time_ds += stop - start;`
			`}`
			`std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;`


			`precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());`
			`precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());`

			`std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;`
			`time_sd = time_ds = 0;`
			`for(int i=0;i<N;i++){`
			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`double start=usecond();`
			`precisionChange(field_d,field_f, wk_sp_to_dp);`
			`double stop=usecond();`
			`time_sd += stop - start;`

			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`start=usecond();`
			`precisionChange(field_f,field_d, wk_dp_to_sp);`
			`stop=usecond();`
			`time_ds += stop - start;`
			`}`
			`std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;`

			`std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;`
			`time_sd = time_ds = 0;`
			`for(int i=0;i<N;i++){`
			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`double start=usecond();`
			`precisionChange(field_d,field_f);`
			`double stop=usecond();`
			`time_sd += stop - start;`

			`field_d = tmp_d;`
			`field_f = tmp_f;`

			`start=usecond();`
			`precisionChange(field_f,field_d);`
			`stop=usecond();`
			`time_ds += stop - start;`

			`}`
			`std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;`


			`std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;`
			`time_sd = time_ds = 0;`
			`for(int i=0;i<N;i++){`
			`field_d2 = tmp_d2;`
			`field_f = tmp_f;`

			`double start=usecond();`
			`precisionChangeFast(field_d2,field_f);`
			`double stop=usecond();`
			`time_sd += stop - start;`

			`field_d2 = tmp_d2;`
			`field_f = tmp_f;`

			`start=usecond();`
			`precisionChangeFast(field_f,field_d2);`
			`stop=usecond();`
			`time_ds += stop - start;`
			`}`
			`std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;`


			`std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;`
			`time_sd = time_ds = 0;`
			`for(int i=0;i<N;i++){`
			`field_d2 = tmp_d2;`
			`field_f = tmp_f;`

			`double start=usecond();`
			`precisionChange(field_d2,field_f);`
			`double stop=usecond();`
			`time_sd += stop - start;`

			`field_d2 = tmp_d2;`
			`field_f = tmp_f;`

			`start=usecond();`
			`precisionChange(field_f,field_d2);`
			`stop=usecond();`
			`time_ds += stop - start;`
			`}`
			`std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;`

			`Grid_finalize();`
			`}`