Better timing tweaks to give sensible results on 24 threads on Edison dual Ivy Bridge nodes.

2026-01-12 04:49:33 +00:00 · 2015-09-28 16:09:04 -07:00
parent 9f4f65cb46
commit af89c40462
2 changed files with 59 additions and 37 deletions
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@@ -25,9 +25,10 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"bytes/thread"<<"\t\t\t"<<"GB/s"<<"\t\t\t"<<"GB/s per thread"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=4;lat<=16536;lat*=2){
+  const int lmax = 16536*16;
+  for(int lat=4;lat<=lmax;lat*=2){

-    int Nloop=16536*1024*4/lat;
+    int Nloop=lmax*128*4/lat;

    std::vector<int> latt_size  ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]});

@@ -37,11 +38,15 @@ int main (int argc, char ** argv)

    Vec tsum; tsum = zero;

+    GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+
    std::vector<double> stop(threads);
    Vector<Vec> sum(threads);

    std::vector<LatticeVec> x(threads,&Grid);
-
+    for(int t=0;t<threads;t++){
+      random(pRNG,x[t]);
+    }

    double start=usecond();
 PARALLEL_FOR_LOOP
@@ -64,6 +69,9 @@ PARALLEL_FOR_LOOP
      if ( stop[t]<min_stop ) min_stop=stop[t];
      if ( stop[t]>max_stop ) max_stop=stop[t];
    }
+
+    
+
    double max_time = (max_stop-start)/Nloop*1000;
    double min_time = (min_stop-start)/Nloop*1000;
      
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -10,144 +10,158 @@ int main (int argc, char ** argv)

  const int Nvec=8;
  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+  typedef iVector<vReal,Nvec> Vec;

-  int Nloop=1000;
+
+  Vec rn = zero;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
  
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-
-  for(int lat=4;lat<=32;lat+=4){
+  uint64_t lmax=44;
+#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      uint64_t Nloop=NLOOP;

-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
      double a=2.0;


      double start=usecond();
      for(int i=0;i<Nloop;i++){
-	//   inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
 	axpy(z,a,x,y);
+        x._odata[0]=z._odata[0]; // serial loop dependence to prevent optimise
+        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  
-  for(int lat=4;lat<=32;lat+=4){
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
      double a=2.0;

+      uint64_t Nloop=NLOOP;
+
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x-y;
+        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
+        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
     
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+
+  for(int lat=4;lat<=lmax;lat+=4){

-  for(int lat=4;lat<=32;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      uint64_t Nloop=NLOOP;

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
      RealD a=2.0;


      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x;
+        x._odata[0]=z._odata[0]*2.0;
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double bytes=2*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
-      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

  }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=4;lat<=32;lat+=4){
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      uint64_t Nloop=NLOOP;
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
-
-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
      RealD a=2.0;
-      ComplexD nn;
-
+      Real nn;      
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	nn=norm2(x);
+	vsplat(x._odata[0]._internal[0],nn);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double bytes=vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;

  }