Improved threaded comms benchmark

2026-07-20 10:33:27 +01:00 · 2017-06-28 23:27:02 +01:00
parent 54e94360ad
commit 6f5a5cd9b3
2 changed files with 102 additions and 3 deletions
@@ -435,6 +435,100 @@ int main (int argc, char ** argv)
 
    }
  }    
+
+
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
+      for(int i=0;i<Nloop;i++){
+	double start=usecond();
+
+	std::vector<CartesianCommunicator::CommsRequest_t> requests;
+	dbytes=0;
+	ncomm=0;
+
+	parallel_for(int dir=0;dir<8;dir++){
+
+	  double tbytes;
+	  int mu =dir % 4;
+
+	  if (mpi_layout[mu]>1 ) {
+	  
+	    ncomm++;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    if ( dir == mu ) { 
+	      int comm_proc=1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    } else { 
+	      int comm_proc = mpi_layout[mu]-1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    }
+	    tbytes= Grid.StencilSendToRecvFromBegin(requests,
+						    (void *)&xbuf[dir][0],
+						    xmit_to_rank,
+						    (void *)&rbuf[dir][0],
+						    recv_from_rank,
+						    bytes,dir);
+	    Grid.StencilSendToRecvFromComplete(requests,dir);
+	    requests.resize(0);
+
+#pragma omp atomic
+	    dbytes+=tbytes;
+	  }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
+      }
+
+      timestat.statistics(t_time);
+
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
+
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+ 
+    }
+  }    
+
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;