diff --git a/Grid/Benchmark_comms_host_device.cpp b/Grid/Benchmark_comms_host_device.cpp
deleted file mode 100644
index e213859..0000000
--- a/Grid/Benchmark_comms_host_device.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
-Copyright © 2015 Peter Boyle
-Copyright © 2022 Antonin Portelli
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-struct time_statistics
-{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v)
-  {
-    double sum = std::accumulate(v.begin(), v.end(), 0.0);
-    mean = sum / v.size();
-
-    std::vector<double> diff(v.size());
-    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
-
-    auto result = std::minmax_element(v.begin(), v.end());
-    min = *result.first;
-    max = *result.second;
-  }
-};
-
-void header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t" << std::setw(11) << "bytes\t\t"
-            << "MB/s uni"
-            << "\t"
-            << "MB/s bidi" << std::endl;
-};
-
-int main(int argc, char **argv)
-{
-  Grid_init(&argc, &argv);
-
-  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
-  Coordinate mpi_layout = GridDefaultMpi();
-  int threads = GridThread::GetThreads();
-  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
-            << std::endl;
-
-  int Nloop = 250;
-  int nmu = 0;
-  int maxlat = 32;
-  for (int mu = 0; mu < Nd; mu++)
-    if (mpi_layout[mu] > 1)
-      nmu++;
-
-  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
-            << std::endl;
-  std::vector<double> t_time(Nloop);
-  // time_statistics timestat;
-
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  std::cout << GridLogMessage
-            << "= Benchmarking sequential halo exchange from host memory " << std::endl;
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  header();
-
-  for (int lat = 8; lat <= maxlat; lat += 4)
-  {
-    for (int Ls = 8; Ls <= 8; Ls *= 2)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
-                            lat * mpi_layout[3]});
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank / Nnode;
-
-      std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
-      std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
-
-      for (int mu = 0; mu < 8; mu++)
-      {
-        xbuf[mu].resize(lat * lat * lat * Ls);
-        rbuf[mu].resize(lat * lat * lat * Ls);
-      }
-      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-
-      int ncomm;
-
-      for (int mu = 0; mu < 4; mu++)
-      {
-        if (mpi_layout[mu] > 1)
-        {
-          double start = usecond();
-          for (int i = 0; i < Nloop; i++)
-          {
-
-            ncomm = 0;
-
-            ncomm++;
-            int comm_proc = 1;
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
-                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
-            }
-
-            comm_proc = mpi_layout[mu] - 1;
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
-                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
-            }
-          }
-          Grid.Barrier();
-          double stop = usecond();
-          double mean = (stop - start) / Nloop;
-          double dbytes = bytes * ppn;
-          double xbytes = dbytes * 2.0 * ncomm;
-          double rbytes = xbytes;
-          double bidibytes = xbytes + rbytes;
-
-          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
-                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
-                    << std::setw(7) << " " << std::right << xbytes / mean << " "
-                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
-        }
-      }
-    }
-  }
-
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  std::cout << GridLogMessage
-            << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  header();
-
-  for (int lat = 8; lat <= maxlat; lat += 4)
-  {
-    for (int Ls = 8; Ls <= 8; Ls *= 2)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
-                            lat * mpi_layout[3]});
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank / Nnode;
-
-      std::vector<HalfSpinColourVectorD *> xbuf(8);
-      std::vector<HalfSpinColourVectorD *> rbuf(8);
-
-      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-      for (int d = 0; d < 8; d++)
-      {
-        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-      }
-
-      int ncomm;
-
-      for (int mu = 0; mu < 4; mu++)
-      {
-        if (mpi_layout[mu] > 1)
-        {
-          double start = usecond();
-          for (int i = 0; i < Nloop; i++)
-          {
-
-            ncomm = 0;
-
-            ncomm++;
-            int comm_proc = 1;
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
-                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
-            }
-
-            comm_proc = mpi_layout[mu] - 1;
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
-                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
-            }
-          }
-          Grid.Barrier();
-          double stop = usecond();
-          double mean = (stop - start) / Nloop;
-          double dbytes = bytes * ppn;
-          double xbytes = dbytes * 2.0 * ncomm;
-          double rbytes = xbytes;
-          double bidibytes = xbytes + rbytes;
-
-          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
-                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
-                    << std::setw(7) << " " << std::right << xbytes / mean << " "
-                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
-        }
-      }
-
-      for (int d = 0; d < 8; d++)
-      {
-        acceleratorFreeDevice(xbuf[d]);
-        acceleratorFreeDevice(rbuf[d]);
-      }
-    }
-  }
-
-  std::cout << GridLogMessage
-            << "========================================================================="
"===========================" - << std::endl; - std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl; - std::cout << GridLogMessage - << "=========================================================================" - "===========================" - << std::endl; - - Grid_finalize(); -} diff --git a/Grid/Benchmark_dwf_fp32.cpp b/Grid/Benchmark_dwf_fp32.cpp deleted file mode 100644 index c0fcf7c..0000000 --- a/Grid/Benchmark_dwf_fp32.cpp +++ /dev/null @@ -1,512 +0,0 @@ -/* -Copyright © 2015 Peter Boyle -Copyright © 2022 Antonin Portelli -Copyright © 2023 Simon Bürger - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "json.hpp" -#include -#ifdef GRID_CUDA -#define CUDA_PROFILE -#endif - -#ifdef CUDA_PROFILE -#include -#endif - -using namespace std; -using namespace Grid; - -template struct scal -{ - d internal; -}; - -Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT}; - -int main(int argc, char **argv) -{ - Grid_init(&argc, &argv); - - int threads = GridThread::GetThreads(); - - Coordinate latt4 = GridDefaultLatt(); - int Ls = 16; - std::string json_filename = ""; // empty indicates no json output - nlohmann::json json; - - // benchmark specific command line arguments - for (int i = 0; i < argc; i++) - { - if (std::string(argv[i]) == "-Ls") - { - std::stringstream ss(argv[i + 1]); - ss >> Ls; - } - if (std::string(argv[i]) == "--json-out") - json_filename = argv[i + 1]; - } - - GridLogLayout(); - - long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc); - - json["single_site_flops"] = single_site_flops; - - GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( - GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); - GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); - - GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); - GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); - - json["grid"] = FGrid->FullDimensions().toVector(); - json["local_grid"] = FGrid->LocalDimensions().toVector(); - - std::cout << GridLogMessage << "Making s innermost grids" << std::endl; - GridCartesian *sUGrid = - SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); - - GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); - GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid); - GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid); - - std::vector seeds4({1, 2, 3, 4}); - std::vector seeds5({5, 6, 7, 8}); - - std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; - GridParallelRNG RNG4(UGrid); - RNG4.SeedUniqueString(std::string("The 4D RNG")); - std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; - GridParallelRNG RNG5(FGrid); - RNG5.SeedUniqueString(std::string("The 5D RNG")); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - 
-  LatticeFermionF src(FGrid);
-  random(RNG5, src);
-#if 0
-  src = Zero();
-  {
-    Coordinate origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=Zero();
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp << std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#endif
-
-  LatticeFermionF result(FGrid);
-  result = Zero();
-  LatticeFermionF ref(FGrid);
-  ref = Zero();
-  LatticeFermionF tmp(FGrid);
-  LatticeFermionF err(FGrid);
-
-  LatticeGaugeFieldF Umu(UGrid);
-  SU<Nc>::HotConfiguration(RNG4, Umu);
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrixF ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //    if (mu !=2 ) ttmp = 0;
-    //    ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif
-
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  // replicate across fifth dimension
-  //  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4, UGrid);
-  for (int mu = 0; mu < Nd; mu++)
-  {
-    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-  }
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
-
-  if (1)
-  {
-    ref = Zero();
-    for (int mu = 0; mu < Nd; mu++)
-    {
-
-      tmp = Cshift(src, mu + 1, 1);
-      {
-        autoView(tmp_v, tmp, CpuWrite);
-        autoView(U_v, U[mu], CpuRead);
-        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
-        {
-          for (int s = 0; s < Ls; s++)
-          {
-            tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
-          }
-        }
-      }
-      ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
-
-      {
-        autoView(tmp_v, tmp, CpuWrite);
-        autoView(U_v, U[mu], CpuRead);
-        autoView(src_v, src, CpuRead);
-        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
-        {
-          for (int s = 0; s < Ls; s++)
-          {
-            tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
-          }
-        }
-      }
-      tmp = Cshift(tmp, mu + 1, -1);
-      ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
-    }
-    ref = -0.5 * ref;
-  }
-
-  RealD mass = 0.1;
-  RealD M5 = 1.8;
-
-  RealD NP = UGrid->_Nprocessors;
-  RealD NN = UGrid->NodeCount();
-
-  json["ranks"] = NP;
-  json["nodes"] = NN;
-
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
-  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
-            << std::endl;
-  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
-            << std::endl;
-
-  if (sizeof(RealF) == 4)
-    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-  if (sizeof(RealF) == 8)
-    std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
-#ifdef GRID_OMP
-  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
-    std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
-  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
-    std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
-#endif
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
-    std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
-    std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
-    std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
WilsonKernels" << std::endl; - std::cout << GridLogMessage - << "*****************************************************************" - << std::endl; - - DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); - int ncall = 300; - - if (1) - { - FGrid->Barrier(); - Dw.ZeroCounters(); - Dw.Dhop(src, result, 0); - std::cout << GridLogMessage << "Called warmup" << std::endl; - double t0 = usecond(); - for (int i = 0; i < ncall; i++) - { - __SSC_START; - Dw.Dhop(src, result, 0); - __SSC_STOP; - } - double t1 = usecond(); - FGrid->Barrier(); - - double volume = Ls; - for (int mu = 0; mu < Nd; mu++) - volume = volume * latt4[mu]; - double flops = single_site_flops * volume * ncall; - - auto nsimd = vComplex::Nsimd(); - auto simdwidth = sizeof(vComplex); - - // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors - double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / - nsimd * ncall / (1024. * 1024. * 1024.); - - // mem: Nd Wilson * Ls, Nd gauge, Nc colors - double data_mem = - (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / - nsimd * ncall / (1024. * 1024. * 1024.); - - json["Dw"]["calls"] = ncall; - json["Dw"]["time"] = t1 - t0; - json["Dw"]["mflops"] = flops / (t1 - t0); - json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP; - json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN; - json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0)); - json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0)); - - std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 - << " us" << std::endl; - // std::cout< 1.0e-4)) - { - /* - std::cout << "RESULT\n " << result<Barrier(); - exit(-1); - } - assert(norm2(err) < 1.0e-4); - Dw.Report(); - } - - if (1) - { // Naive wilson dag implementation - ref = Zero(); - for (int mu = 0; mu < Nd; mu++) - { - - // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x - tmp = Cshift(src, mu + 1, 1); - { - autoView(ref_v, ref, CpuWrite); - autoView(tmp_v, tmp, CpuRead); - autoView(U_v, U[mu], CpuRead); - for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) - { - for (int s = 0; s < Ls; s++) - { - int i = s + Ls * ss; - ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]); - ; - } - } - } - - { - autoView(tmp_v, tmp, CpuWrite); - autoView(U_v, U[mu], CpuRead); - autoView(src_v, src, CpuRead); - for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) - { - for (int s = 0; s < Ls; s++) - { - tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s]; - } - } - } - // tmp =adj(U[mu])*src; - tmp = Cshift(tmp, mu + 1, -1); - { - autoView(ref_v, ref, CpuWrite); - autoView(tmp_v, tmp, CpuRead); - for (int i = 0; i < ref_v.size(); i++) - { - ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i]; - ; - } - } - } - ref = -0.5 * ref; - } - // dump=1; - Dw.Dhop(src, result, 1); - - std::cout << GridLogMessage - << "Compare to naive wilson implementation Dag to verify correctness" - << std::endl; - std::cout << GridLogMessage << "Called DwDag" << std::endl; - std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl; - std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl; - err = ref - result; - std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl; - if ((norm2(err) > 1.0e-4)) - { - /* - std::cout<< "DAG RESULT\n " <Barrier(); - Dw.DhopEO(src_o, r_e, DaggerNo); - double t0 = usecond(); - for (int i = 0; i < ncall; i++) - { -#ifdef CUDA_PROFILE - if (i == 10) - cudaProfilerStart(); -#endif - Dw.DhopEO(src_o, r_e, DaggerNo); 
-#ifdef CUDA_PROFILE
-      if (i == 20)
-        cudaProfilerStop();
-#endif
-    }
-    double t1 = usecond();
-    FGrid->Barrier();
-
-    double volume = Ls;
-    for (int mu = 0; mu < Nd; mu++)
-      volume = volume * latt4[mu];
-    double flops = (single_site_flops * volume * ncall) / 2.0;
-
-    json["Deo"]["calls"] = ncall;
-    json["Deo"]["time"] = t1 - t0;
-    json["Deo"]["mflops"] = flops / (t1 - t0);
-    json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
-    json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
-
-    std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
-              << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
-              << std::endl;
-
-    Dw.Report();
-  }
-  Dw.DhopEO(src_o, r_e, DaggerNo);
-  Dw.DhopOE(src_e, r_o, DaggerNo);
-  Dw.Dhop(src, result, DaggerNo);
-
-  std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
-  std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
-  std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
-
-  setCheckerboard(r_eo, r_o);
-  setCheckerboard(r_eo, r_e);
-
-  err = r_eo - result;
-  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  if ((norm2(err) > 1.0e-4))
-  {
-    /*
-    std::cout<< "Deo RESULT\n " <