Grid column formatting
This commit is contained in:
		@@ -6,7 +6,7 @@
 | 
			
		||||
  BreakBeforeBraces: Allman,
 | 
			
		||||
  AllowShortIfStatementsOnASingleLine: false,
 | 
			
		||||
  IndentCaseLabels: false,
 | 
			
		||||
  ColumnLimit: 0,
 | 
			
		||||
  ColumnLimit: 90,
 | 
			
		||||
  AccessModifierOffset: -4,
 | 
			
		||||
  NamespaceIndentation: All,
 | 
			
		||||
  FixNamespaceComments: false,
 | 
			
		||||
 
 | 
			
		||||
@@ -122,87 +122,82 @@ class Benchmark
 | 
			
		||||
 | 
			
		||||
    for (int lat = 16; lat <= maxlat; lat += 8)
 | 
			
		||||
    {
 | 
			
		||||
      //      for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
      int Ls = 12;
 | 
			
		||||
 | 
			
		||||
      Coordinate latt_size({lat * mpi_layout[0],
 | 
			
		||||
                            lat * mpi_layout[1],
 | 
			
		||||
                            lat * mpi_layout[2],
 | 
			
		||||
                            lat * mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
 | 
			
		||||
      RealD Nrank = Grid._Nprocessors;
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank / Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> xbuf(8);
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> rbuf(8);
 | 
			
		||||
      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
 | 
			
		||||
      for (int d = 0; d < 8; d++)
 | 
			
		||||
      {
 | 
			
		||||
        int Ls = 12;
 | 
			
		||||
        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 | 
			
		||||
        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 | 
			
		||||
        //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
        //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
        Coordinate latt_size({lat * mpi_layout[0],
 | 
			
		||||
                              lat * mpi_layout[1],
 | 
			
		||||
                              lat * mpi_layout[2],
 | 
			
		||||
                              lat * mpi_layout[3]});
 | 
			
		||||
      double dbytes;
 | 
			
		||||
 | 
			
		||||
        GridCartesian Grid(latt_size, simd_layout, mpi_layout);
 | 
			
		||||
        RealD Nrank = Grid._Nprocessors;
 | 
			
		||||
        RealD Nnode = Grid.NodeCount();
 | 
			
		||||
        RealD ppn = Nrank / Nnode;
 | 
			
		||||
 | 
			
		||||
        std::vector<HalfSpinColourVectorD *> xbuf(8);
 | 
			
		||||
        std::vector<HalfSpinColourVectorD *> rbuf(8);
 | 
			
		||||
        // Grid.ShmBufferFreeAll();
 | 
			
		||||
        uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
 | 
			
		||||
        for (int d = 0; d < 8; d++)
 | 
			
		||||
      for (int dir = 0; dir < 8; dir++)
 | 
			
		||||
      {
 | 
			
		||||
        int mu = dir % 4;
 | 
			
		||||
        if (mpi_layout[mu] > 1)
 | 
			
		||||
        {
 | 
			
		||||
          xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 | 
			
		||||
          rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 | 
			
		||||
          //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
          //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        //	int ncomm;
 | 
			
		||||
        double dbytes;
 | 
			
		||||
 | 
			
		||||
        for (int dir = 0; dir < 8; dir++)
 | 
			
		||||
        {
 | 
			
		||||
          int mu = dir % 4;
 | 
			
		||||
          if (mpi_layout[mu] > 1)
 | 
			
		||||
          std::vector<double> times(Nloop);
 | 
			
		||||
          for (int i = 0; i < Nloop; i++)
 | 
			
		||||
          {
 | 
			
		||||
 | 
			
		||||
            std::vector<double> times(Nloop);
 | 
			
		||||
            for (int i = 0; i < Nloop; i++)
 | 
			
		||||
            dbytes = 0;
 | 
			
		||||
            double start = usecond();
 | 
			
		||||
            int xmit_to_rank;
 | 
			
		||||
            int recv_from_rank;
 | 
			
		||||
 | 
			
		||||
            if (dir == mu)
 | 
			
		||||
            {
 | 
			
		||||
 | 
			
		||||
              dbytes = 0;
 | 
			
		||||
              double start = usecond();
 | 
			
		||||
              int xmit_to_rank;
 | 
			
		||||
              int recv_from_rank;
 | 
			
		||||
 | 
			
		||||
              if (dir == mu)
 | 
			
		||||
              {
 | 
			
		||||
                int comm_proc = 1;
 | 
			
		||||
                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              }
 | 
			
		||||
              else
 | 
			
		||||
              {
 | 
			
		||||
                int comm_proc = mpi_layout[mu] - 1;
 | 
			
		||||
                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              }
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[dir][0], recv_from_rank,
 | 
			
		||||
                                  bytes);
 | 
			
		||||
              dbytes += bytes;
 | 
			
		||||
 | 
			
		||||
              double stop = usecond();
 | 
			
		||||
              t_time[i] = stop - start; // microseconds
 | 
			
		||||
              int comm_proc = 1;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
            }
 | 
			
		||||
            timestat.statistics(t_time);
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
              int comm_proc = mpi_layout[mu] - 1;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
            }
 | 
			
		||||
            Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 | 
			
		||||
                                (void *)&rbuf[dir][0], recv_from_rank,
 | 
			
		||||
                                bytes);
 | 
			
		||||
            dbytes += bytes;
 | 
			
		||||
 | 
			
		||||
            dbytes = dbytes * ppn;
 | 
			
		||||
            double xbytes = dbytes * 0.5;
 | 
			
		||||
            double bidibytes = dbytes;
 | 
			
		||||
 | 
			
		||||
            std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
 | 
			
		||||
                      << bytes << " \t "
 | 
			
		||||
                      << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
 | 
			
		||||
                      << xbytes / timestat.max << " " << xbytes / timestat.min
 | 
			
		||||
                      << "\t\t" << bidibytes / timestat.mean << "  " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
 | 
			
		||||
                      << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
 | 
			
		||||
            double stop = usecond();
 | 
			
		||||
            t_time[i] = stop - start; // microseconds
 | 
			
		||||
          }
 | 
			
		||||
          timestat.statistics(t_time);
 | 
			
		||||
 | 
			
		||||
          dbytes = dbytes * ppn;
 | 
			
		||||
          double xbytes = dbytes * 0.5;
 | 
			
		||||
          double bidibytes = dbytes;
 | 
			
		||||
 | 
			
		||||
          std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
 | 
			
		||||
                    << bytes << " \t "
 | 
			
		||||
                    << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
 | 
			
		||||
                    << xbytes / timestat.max << " " << xbytes / timestat.min
 | 
			
		||||
                    << "\t\t" << bidibytes / timestat.mean << "  " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
 | 
			
		||||
                    << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
 | 
			
		||||
        }
 | 
			
		||||
        for (int d = 0; d < 8; d++)
 | 
			
		||||
        {
 | 
			
		||||
          acceleratorFreeDevice(xbuf[d]);
 | 
			
		||||
          acceleratorFreeDevice(rbuf[d]);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      for (int d = 0; d < 8; d++)
 | 
			
		||||
      {
 | 
			
		||||
        acceleratorFreeDevice(xbuf[d]);
 | 
			
		||||
        acceleratorFreeDevice(rbuf[d]);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
 
 | 
			
		||||
@@ -32,23 +32,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
std::string filestem(const int l)
 | 
			
		||||
{
 | 
			
		||||
  return "iobench_l" + std::to_string(l);
 | 
			
		||||
}
 | 
			
		||||
std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
 | 
			
		||||
 | 
			
		||||
int vol(const int i)
 | 
			
		||||
{
 | 
			
		||||
  return BENCH_IO_LMIN + 2 * i;
 | 
			
		||||
}
 | 
			
		||||
int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
 | 
			
		||||
 | 
			
		||||
int volInd(const int l)
 | 
			
		||||
{
 | 
			
		||||
  return (l - BENCH_IO_LMIN) / 2;
 | 
			
		||||
}
 | 
			
		||||
int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; }
 | 
			
		||||
 | 
			
		||||
template <typename Mat>
 | 
			
		||||
void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 | 
			
		||||
template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 | 
			
		||||
{
 | 
			
		||||
  auto nr = data[0].rows(), nc = data[0].cols();
 | 
			
		||||
  Eigen::MatrixXd sqSum(nr, nc);
 | 
			
		||||
@@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 | 
			
		||||
  mean /= n;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define grid_printf(...)        \
 | 
			
		||||
  {                             \
 | 
			
		||||
    char _buf[1024];            \
 | 
			
		||||
    sprintf(_buf, __VA_ARGS__); \
 | 
			
		||||
    MSG << _buf;                \
 | 
			
		||||
#define grid_printf(...)                                                                 \
 | 
			
		||||
  {                                                                                      \
 | 
			
		||||
    char _buf[1024];                                                                     \
 | 
			
		||||
    sprintf(_buf, __VA_ARGS__);                                                          \
 | 
			
		||||
    MSG << _buf;                                                                         \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
enum
 | 
			
		||||
@@ -173,47 +163,49 @@ int main(int argc, char **argv)
 | 
			
		||||
  MSG << "SUMMARY" << std::endl;
 | 
			
		||||
  MSG << BIGSEP << std::endl;
 | 
			
		||||
  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
 | 
			
		||||
  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
 | 
			
		||||
  MSG << "Every second colum gives the standard deviation of the previous column."
 | 
			
		||||
      << std::endl;
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
 | 
			
		||||
              "L", "std read", "std dev", "std write", "std dev",
 | 
			
		||||
              "Grid read", "std dev", "Grid write", "std dev");
 | 
			
		||||
  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
 | 
			
		||||
              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 | 
			
		||||
  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 | 
			
		||||
  {
 | 
			
		||||
    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
 | 
			
		||||
                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
 | 
			
		||||
                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
 | 
			
		||||
                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
 | 
			
		||||
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
 | 
			
		||||
    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l,
 | 
			
		||||
                mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite),
 | 
			
		||||
                stdDev(volInd(l), sWrite), mean(volInd(l), gRead),
 | 
			
		||||
                stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
 | 
			
		||||
                stdDev(volInd(l), gWrite));
 | 
			
		||||
  }
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
 | 
			
		||||
  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
 | 
			
		||||
      << std::endl;
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  grid_printf("%4s %12s %12s %12s %12s\n",
 | 
			
		||||
              "L", "std read", "std write", "Grid read", "Grid write");
 | 
			
		||||
  grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
 | 
			
		||||
              "Grid write");
 | 
			
		||||
  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 | 
			
		||||
  {
 | 
			
		||||
    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
 | 
			
		||||
                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
 | 
			
		||||
                rob(volInd(l), gRead), rob(volInd(l), gWrite));
 | 
			
		||||
    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
 | 
			
		||||
                rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
 | 
			
		||||
  }
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
 | 
			
		||||
  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
 | 
			
		||||
  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
 | 
			
		||||
      << "^4 (all results in MB/s)." << std::endl;
 | 
			
		||||
  MSG << "Every second colum gives the standard deviation of the previous column."
 | 
			
		||||
      << std::endl;
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
 | 
			
		||||
              "std read", "std dev", "std write", "std dev",
 | 
			
		||||
              "Grid read", "std dev", "Grid write", "std dev");
 | 
			
		||||
  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
 | 
			
		||||
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
 | 
			
		||||
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
 | 
			
		||||
  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
 | 
			
		||||
              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 | 
			
		||||
  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
 | 
			
		||||
              avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
 | 
			
		||||
              avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
 | 
			
		||||
  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
 | 
			
		||||
      << std::endl;
 | 
			
		||||
  MSG << std::endl;
 | 
			
		||||
  grid_printf("%12s %12s %12s %12s\n",
 | 
			
		||||
              "std read", "std write", "Grid read", "Grid write");
 | 
			
		||||
  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
 | 
			
		||||
              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
 | 
			
		||||
  grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
 | 
			
		||||
              "Grid write");
 | 
			
		||||
  grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
 | 
			
		||||
              avRob(gWrite));
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -34,8 +34,7 @@ struct time_statistics
 | 
			
		||||
    mean = sum / v.size();
 | 
			
		||||
 | 
			
		||||
    std::vector<double> diff(v.size());
 | 
			
		||||
    std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
 | 
			
		||||
                   { return x - mean; });
 | 
			
		||||
    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
 | 
			
		||||
    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
 | 
			
		||||
    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
 | 
			
		||||
 | 
			
		||||
@@ -50,8 +49,7 @@ void header()
 | 
			
		||||
  std::cout << GridLogMessage << " L  "
 | 
			
		||||
            << "\t"
 | 
			
		||||
            << " Ls  "
 | 
			
		||||
            << "\t"
 | 
			
		||||
            << std::setw(11) << "bytes\t\t"
 | 
			
		||||
            << "\t" << std::setw(11) << "bytes\t\t"
 | 
			
		||||
            << "MB/s uni"
 | 
			
		||||
            << "\t"
 | 
			
		||||
            << "MB/s bidi" << std::endl;
 | 
			
		||||
@@ -64,7 +62,8 @@ int main(int argc, char **argv)
 | 
			
		||||
  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
 | 
			
		||||
  Coordinate mpi_layout = GridDefaultMpi();
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
 | 
			
		||||
  int Nloop = 250;
 | 
			
		||||
  int nmu = 0;
 | 
			
		||||
@@ -73,13 +72,21 @@ int main(int argc, char **argv)
 | 
			
		||||
    if (mpi_layout[mu] > 1)
 | 
			
		||||
      nmu++;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::vector<double> t_time(Nloop);
 | 
			
		||||
  //  time_statistics timestat;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from host memory " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "= Benchmarking sequential halo exchange from host memory " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for (int lat = 8; lat <= maxlat; lat += 4)
 | 
			
		||||
@@ -87,9 +94,7 @@ int main(int argc, char **argv)
 | 
			
		||||
    for (int Ls = 8; Ls <= 8; Ls *= 2)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      Coordinate latt_size({lat * mpi_layout[0],
 | 
			
		||||
                            lat * mpi_layout[1],
 | 
			
		||||
                            lat * mpi_layout[2],
 | 
			
		||||
      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
 | 
			
		||||
                            lat * mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
 | 
			
		||||
@@ -127,22 +132,16 @@ int main(int argc, char **argv)
 | 
			
		||||
            {
 | 
			
		||||
              std::vector<CommsRequest_t> requests;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu][0],
 | 
			
		||||
                                  xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu][0],
 | 
			
		||||
                                  recv_from_rank,
 | 
			
		||||
                                  bytes);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            comm_proc = mpi_layout[mu] - 1;
 | 
			
		||||
            {
 | 
			
		||||
              std::vector<CommsRequest_t> requests;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0],
 | 
			
		||||
                                  xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu + 4][0],
 | 
			
		||||
                                  recv_from_rank,
 | 
			
		||||
                                  bytes);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
          Grid.Barrier();
 | 
			
		||||
@@ -154,17 +153,24 @@ int main(int argc, char **argv)
 | 
			
		||||
          double bidibytes = xbytes + rbytes;
 | 
			
		||||
 | 
			
		||||
          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
 | 
			
		||||
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " "
 | 
			
		||||
                    << std::right << xbytes / mean << "  "
 | 
			
		||||
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
 | 
			
		||||
                    << std::setw(7) << " " << std::right << xbytes / mean << "  "
 | 
			
		||||
                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for (int lat = 8; lat <= maxlat; lat += 4)
 | 
			
		||||
@@ -172,9 +178,7 @@ int main(int argc, char **argv)
 | 
			
		||||
    for (int Ls = 8; Ls <= 8; Ls *= 2)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      Coordinate latt_size({lat * mpi_layout[0],
 | 
			
		||||
                            lat * mpi_layout[1],
 | 
			
		||||
                            lat * mpi_layout[2],
 | 
			
		||||
      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
 | 
			
		||||
                            lat * mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
 | 
			
		||||
@@ -212,22 +216,16 @@ int main(int argc, char **argv)
 | 
			
		||||
            {
 | 
			
		||||
              std::vector<CommsRequest_t> requests;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu][0],
 | 
			
		||||
                                  xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu][0],
 | 
			
		||||
                                  recv_from_rank,
 | 
			
		||||
                                  bytes);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            comm_proc = mpi_layout[mu] - 1;
 | 
			
		||||
            {
 | 
			
		||||
              std::vector<CommsRequest_t> requests;
 | 
			
		||||
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0],
 | 
			
		||||
                                  xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu + 4][0],
 | 
			
		||||
                                  recv_from_rank,
 | 
			
		||||
                                  bytes);
 | 
			
		||||
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
 | 
			
		||||
                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
          Grid.Barrier();
 | 
			
		||||
@@ -239,8 +237,8 @@ int main(int argc, char **argv)
 | 
			
		||||
          double bidibytes = xbytes + rbytes;
 | 
			
		||||
 | 
			
		||||
          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
 | 
			
		||||
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " "
 | 
			
		||||
                    << std::right << xbytes / mean << "  "
 | 
			
		||||
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
 | 
			
		||||
                    << std::setw(7) << " " << std::right << xbytes / mean << "  "
 | 
			
		||||
                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
@@ -253,9 +251,15 @@ int main(int argc, char **argv)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "====================================================================================================" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "========================================================================="
 | 
			
		||||
               "==========================="
 | 
			
		||||
            << std::endl;
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -28,17 +28,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
template <class d>
 | 
			
		||||
struct scal
 | 
			
		||||
template <class d> struct scal
 | 
			
		||||
{
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
Gamma::Algebra Gmu[] = {
 | 
			
		||||
    Gamma::Algebra::GammaX,
 | 
			
		||||
    Gamma::Algebra::GammaY,
 | 
			
		||||
    Gamma::Algebra::GammaZ,
 | 
			
		||||
    Gamma::Algebra::GammaT};
 | 
			
		||||
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
 | 
			
		||||
                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
@@ -59,13 +55,15 @@ int main(int argc, char **argv)
 | 
			
		||||
 | 
			
		||||
  long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
 | 
			
		||||
 | 
			
		||||
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
 | 
			
		||||
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
 | 
			
		||||
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
 | 
			
		||||
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
 | 
			
		||||
  GridCartesian *sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
 | 
			
		||||
  GridCartesian *sUGrid =
 | 
			
		||||
      SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
 | 
			
		||||
  GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
 | 
			
		||||
  GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
 | 
			
		||||
@@ -177,13 +175,24 @@ int main(int argc, char **argv)
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
  RealD NN = UGrid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionR::Dhop                  " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*****************************************************************"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*****************************************************************"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*****************************************************************"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "* Benchmarking DomainWallFermionR::Dhop                  " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  if (sizeof(RealF) == 4)
 | 
			
		||||
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
 | 
			
		||||
  if (sizeof(RealF) == 8)
 | 
			
		||||
@@ -200,7 +209,9 @@ int main(int argc, char **argv)
 | 
			
		||||
    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
 | 
			
		||||
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
 | 
			
		||||
    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*****************************************************************"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
 | 
			
		||||
  int ncall = 300;
 | 
			
		||||
@@ -230,19 +241,29 @@ int main(int argc, char **argv)
 | 
			
		||||
    auto simdwidth = sizeof(vComplex);
 | 
			
		||||
 | 
			
		||||
    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
 | 
			
		||||
    double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
 | 
			
		||||
    double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
 | 
			
		||||
                     nsimd * ncall / (1024. * 1024. * 1024.);
 | 
			
		||||
 | 
			
		||||
    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
 | 
			
		||||
    double data_mem = (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
 | 
			
		||||
    double data_mem =
 | 
			
		||||
        (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
 | 
			
		||||
        nsimd * ncall / (1024. * 1024. * 1024.);
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 << " us" << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
 | 
			
		||||
              << " us" << std::endl;
 | 
			
		||||
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mflop/s =   " << flops / (t1 - t0) << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mflop/s per rank =  " << flops / (t1 - t0) / NP << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mflop/s per node =  " << flops / (t1 - t0) / NN << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "RF  GiB/s (base 2) =   " << 1000000. * data_rf / ((t1 - t0)) << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mem GiB/s (base 2) =   " << 1000000. * data_mem / ((t1 - t0)) << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mflop/s per rank =  " << flops / (t1 - t0) / NP
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "mflop/s per node =  " << flops / (t1 - t0) / NN
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage
 | 
			
		||||
              << "RF  GiB/s (base 2) =   " << 1000000. * data_rf / ((t1 - t0))
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage
 | 
			
		||||
              << "mem GiB/s (base 2) =   " << 1000000. * data_mem / ((t1 - t0))
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    err = ref - result;
 | 
			
		||||
    std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
 | 
			
		||||
    // exit(0);
 | 
			
		||||
@@ -313,7 +334,9 @@ int main(int argc, char **argv)
 | 
			
		||||
  }
 | 
			
		||||
  //  dump=1;
 | 
			
		||||
  Dw.Dhop(src, result, 1);
 | 
			
		||||
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "Compare to naive wilson implementation Dag to verify correctness"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Called DwDag" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "norm dag ref    " << norm2(ref) << std::endl;
 | 
			
		||||
@@ -333,7 +356,8 @@ int main(int argc, char **argv)
 | 
			
		||||
  LatticeFermionF r_o(FrbGrid);
 | 
			
		||||
  LatticeFermionF r_eo(FGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  pickCheckerboard(Even, src_e, src);
 | 
			
		||||
  pickCheckerboard(Odd, src_o, src);
 | 
			
		||||
 | 
			
		||||
@@ -341,9 +365,12 @@ int main(int argc, char **argv)
 | 
			
		||||
  std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
 | 
			
		||||
 | 
			
		||||
  // S-direction is INNERMOST and takes no part in the parity.
 | 
			
		||||
  std::cout << GridLogMessage << "*********************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionF::DhopEO                " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*********************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "* Benchmarking DomainWallFermionF::DhopEO                " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
 | 
			
		||||
            << std::endl;
 | 
			
		||||
  if (sizeof(RealF) == 4)
 | 
			
		||||
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
 | 
			
		||||
  if (sizeof(RealF) == 8)
 | 
			
		||||
@@ -360,7 +387,8 @@ int main(int argc, char **argv)
 | 
			
		||||
    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
 | 
			
		||||
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
 | 
			
		||||
    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "*********************************************************" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage
 | 
			
		||||
            << "*********************************************************" << std::endl;
 | 
			
		||||
  {
 | 
			
		||||
    Dw.ZeroCounters();
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
@@ -387,8 +415,10 @@ int main(int argc, char **argv)
 | 
			
		||||
    double flops = (single_site_flops * volume * ncall) / 2.0;
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "Deo mflop/s =   " << flops / (t1 - t0) << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Deo mflop/s per rank   " << flops / (t1 - t0) / NP << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Deo mflop/s per node   " << flops / (t1 - t0) / NN << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Deo mflop/s per rank   " << flops / (t1 - t0) / NP
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Deo mflop/s per node   " << flops / (t1 - t0) / NN
 | 
			
		||||
              << std::endl;
 | 
			
		||||
    Dw.Report();
 | 
			
		||||
  }
 | 
			
		||||
  Dw.DhopEO(src_o, r_e, DaggerNo);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user