Benchmarks modified for new threading and GPU constructs (thread_for, accelerator_for, coalescedRead/coalescedWrite)

2026-01-09 03:19:34 +00:00 · 2019-06-15 12:52:56 +01:00
parent 0184719216
commit 0561c2edeb
6 changed files with 61 additions and 84 deletions
--- a/benchmarks/Benchmark_IO_vs_dir.cc
+++ b/benchmarks/Benchmark_IO_vs_dir.cc
@@ -5,7 +5,6 @@
 "============================================================================="

 using namespace Grid;
-using namespace QCD;

 int main (int argc, char ** argv)
 {
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -153,8 +153,8 @@ public:
 	  dbytes=0;
 	  ncomm=0;

-	  thread_loop( (int dir=0;dir<8;dir++),{
-
+	  thread_for(dir,8,{
+		     
 	    double tbytes;
 	    int mu =dir % 4;

--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -485,7 +485,7 @@ int main (int argc, char ** argv)
 	dbytes=0;
 	ncomm=0;

-	thread_loop( (int dir=0;dir<8;dir++),{
+	thread_for(dir,8,{

 	  double tbytes;
 	  int mu =dir % 4;
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@@ -78,7 +78,7 @@ int main (int argc, char ** argv)
    }

    double start=usecond();
-    thread_loop( (int t=0;t<threads;t++),{
+    thread_for(t,threads,{
      auto x_t = x[t].View();
      sum[t] = x_t[0];
      for(int i=0;i<Nloop;i++){
--- a/benchmarks/Benchmark_meson_field.cc
+++ b/benchmarks/Benchmark_meson_field.cc
@@ -30,8 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
-using namespace Grid::QCD;
-

 #include "Grid/util/Profiling.h"

@@ -67,7 +65,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
  // sum across these down to scalars
  // splitting the SIMD
  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd*Lblock*Rblock);
-  thread_loop( (int r = 0; r < rd * Lblock * Rblock; r++),{
+  thread_for(r, rd * Lblock * Rblock,{
    lvSum[r] = Zero();
  });

@@ -79,7 +77,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
  
  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  thread_loop((int r=0;r<rd;r++),{
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@@ -114,7 +112,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_loop((int rt=0;rt<rd;rt++),{
+  thread_for(rt,rd,{
    
    Coordinate icoor(Nd);

@@ -199,12 +197,12 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
  int MFlvol = ld*Lblock*Rblock*Ngamma;

  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(MFrvol);
-  thread_loop( (int r = 0; r < MFrvol; r++),{
+  thread_for(r,MFrvol,{
    lvSum[r] = Zero();
  });

  std::vector<scalar_type > lsSum(MFlvol);             
-  thread_loop( (int r = 0; r < MFlvol; r++),{
+  thread_for( r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
  });

@@ -215,7 +213,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  thread_loop((int r=0;r<rd;r++),{
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@@ -256,7 +254,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_loop((int rt=0;rt<rd;rt++),{
+  thread_for(rt,rd,{

    iScalar<vector_type> temp; 
    Coordinate icoor(Nd);
@@ -347,12 +345,12 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
  int MFlvol = ld*Lblock*Rblock;

  Vector<SpinMatrix_v > lvSum(MFrvol);
-  thread_loop( (int r = 0; r < MFrvol; r++),{
+  thread_for(r,MFrvol,{
    lvSum[r] = Zero();
  });

  Vector<SpinMatrix_s > lsSum(MFlvol);             
-  thread_loop( (int r = 0; r < MFlvol; r++),{
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
  });

@@ -363,7 +361,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  thread_loop((int r=0;r<rd;r++),{
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@@ -398,7 +396,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_loop((int rt=0;rt<rd;rt++),{
+  thread_for(rt,rd,{

    Coordinate icoor(Nd);
    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
@@ -425,7 +423,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
  });

  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-  thread_loop((int t=0;t<fd;t++)
+  thread_for(t,fd,
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
@@ -490,13 +488,13 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<SpinMatrix_v > lvSum(MFrvol);
-  thread_loop( (int r = 0; r < MFrvol; r++),
+  thread_for(r,MFrvol,
  {
    lvSum[r] = Zero();
  });

  Vector<SpinMatrix_s > lsSum(MFlvol);             
-  thread_loop( (int r = 0; r < MFlvol; r++),
+  thread_for(r,MFlvol,
  {
    lsSum[r]=scalar_type(0.0);
  });
@@ -508,7 +506,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  thread_loop((int r=0;r<rd;r++),
+  thread_for(r,rd,
  {

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@@ -552,7 +550,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_loop((int rt=0;rt<rd;rt++),
+  thread_for(rt,rd,
  {

    Coordinate icoor(Nd);
@@ -582,7 +580,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
  });

  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-  thread_loop((int t=0;t<fd;t++),
+  thread_for(t,fd,
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
--- a/benchmarks/Benchmark_su3_gpu.cc
+++ b/benchmarks/Benchmark_su3_gpu.cc
@@ -71,26 +71,21 @@ int main (int argc, char ** argv)
      auto z_v = z.View();
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
-      const uint64_t    NN = Nsite*nsimd;
      for(int64_t i=0;i<Nwarm;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = xx*yy;
-	  insertLane(lane,z_v[ss],zz);
+	  coalescedWrite(z_v[ss],zz);
        });
      }
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = xx*yy;
-	  insertLane(lane,z_v[ss],zz);
+	  coalescedWrite(z_v[ss],zz);
        });
      }
      double stop=usecond();
@@ -126,26 +121,21 @@ int main (int argc, char ** argv)
      auto z_v = z.View();
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
-      const uint64_t    NN = Nsite*nsimd;
      for(int64_t i=0;i<Nwarm;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = xx*yy;
-	  insertLane(lane,x_v[ss],zz);
+	  coalescedWrite(x_v[ss],zz);
        });
      }
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = xx*yy;
-	  insertLane(lane,x_v[ss],zz);
+	  coalescedWrite(x_v[ss],zz);
        });
      }
      double stop=usecond();
@@ -182,28 +172,23 @@ int main (int argc, char ** argv)
      auto z_v = z.View();
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
-      const uint64_t    NN = Nsite*nsimd;
      for(int64_t i=0;i<Nwarm;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
-	  auto zz = extractLane(lane,z_v[ss]);
-	  zz = zz + xx * yy;
-	  insertLane(lane,z_v[ss],zz);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
+	  auto zz = coalescedRead(z_v[ss]);
+	  zz = zz+xx*yy;
+	  coalescedWrite(z_v[ss],zz);
        });
      }
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
-	  auto zz = extractLane(lane,z_v[ss]);
-	  zz = zz + xx * yy;
-	  insertLane(lane,x_v[ss],zz);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
+	  auto zz = coalescedRead(z_v[ss]);
+	  zz = zz+xx*yy;
+	  coalescedWrite(z_v[ss],zz);
        });
      }
      double stop=usecond();
@@ -241,28 +226,23 @@ int main (int argc, char ** argv)
      auto w_v = z.View();
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
-      const uint64_t    NN = Nsite*nsimd;
      for(int64_t i=0;i<Nwarm;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
-	  auto zz = extractLane(lane,z_v[ss]);
-	  auto ww = zz + xx * yy;
-	  insertLane(lane,w_v[ss],ww);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
+	  auto zz = coalescedRead(z_v[ss]);
+	  auto ww = zz+xx*yy;
+	  coalescedWrite(w_v[ss],ww);
        });
      }
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
-	accelerator_loopN( sss, NN, {
-	  uint64_t lane = sss % nsimd; 
-	  uint64_t ss   = sss / nsimd;
-	  auto xx = extractLane(lane,x_v[ss]);
-	  auto yy = extractLane(lane,y_v[ss]);
-	  auto zz = extractLane(lane,z_v[ss]);
-	  auto ww = zz + xx * yy;
-	  insertLane(lane,w_v[ss],ww);
+	accelerator_for( ss, Nsite, nsimd ,{
+	  auto xx = coalescedRead(x_v[ss]);
+	  auto yy = coalescedRead(y_v[ss]);
+	  auto zz = coalescedRead(z_v[ss]);
+	  auto ww = zz+xx*yy;
+	  coalescedWrite(w_v[ss],ww);
        });
      }
      double stop=usecond();