Mirror of https://github.com/paboyle/Grid.git
Synced 2025-11-01 04:24:32 +00:00

Commit: Added a comms benchmark

New file: benchmarks/Grid_comms.cc (83 lines added)
benchmarks/Grid_comms.cc (new file):

@@ -0,0 +1,83 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> simd_layout({1,1,2,2});
+  std::vector<int> mpi_layout ({1,2,2,1});
+
+
+  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  int Nloop=10;
+  for(int lat=4;lat<=16;lat+=4){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double start=usecond();
+      int ncomm=0;
+      for(int i=0;i<Nloop;i++){
+        std::vector<int> latt_size ({lat,lat,lat,lat});
+
+        GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+
+        std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+        std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+        std::vector<CartesianCommunicator::CommsRequest_t> requests;
+
+        ncomm=0;
+        for(int mu=0;mu<4;mu++){
+
+          if (mpi_layout[mu]>1 ) {
+
+            ncomm++;
+            int comm_proc=1;
+            int xmit_to_rank;
+            int recv_from_rank;
+
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            Grid.SendToRecvFromBegin(requests,
+                                     (void *)&xbuf[mu][0],
+                                     xmit_to_rank,
+                                     (void *)&rbuf[mu][0],
+                                     recv_from_rank,
+                                     bytes);
+
+            comm_proc = mpi_layout[mu]-1;
+
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            Grid.SendToRecvFromBegin(requests,
+                                     (void *)&xbuf[mu+4][0],
+                                     xmit_to_rank,
+                                     (void *)&rbuf[mu+4][0],
+                                     recv_from_rank,
+                                     bytes);
+
+          }
+        }
+
+        Grid.SendToRecvFromComplete(requests);
+        Grid.Barrier();
+      }
+
+      double stop=usecond();
+
+      double xbytes    = Nloop*bytes*2*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      double time = stop-start;
+
+      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+    }
+
+  }
+
+
+  Grid_finalize();
+}
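For orientation, the MB/s columns printed by the new benchmark are plain byte counts divided by the elapsed time from usecond(); since usecond() reports microseconds, bytes per microsecond is numerically MB/s. Below is a minimal standalone sketch of that arithmetic, not Grid code: the 96-byte payload (2 spins x 3 colours of complex double) and the 0.5 s elapsed time are assumptions, where the real benchmark uses sizeof(HalfSpinColourVectorD) and a measured interval.

// Standalone sketch of the rate arithmetic used by Grid_comms.cc above.
#include <cstdio>

int main(void)
{
  const int    lat = 16, Ls = 16, Nloop = 10;
  const double halfSpinorBytes = 96.0;   // assumed sizeof(HalfSpinColourVectorD)
  const int    ncomm = 2;                // mpi_layout {1,2,2,1}: two partitioned dims

  double bytes   = double(lat)*lat*lat*Ls*halfSpinorBytes;  // one face buffer
  double time_us = 5.0e5;                // assumed: 0.5 s for the Nloop iterations

  // Each partitioned dimension posts two outgoing faces per iteration,
  // so uni-directional traffic is Nloop*bytes*2*ncomm; receives double it.
  double xbytes    = Nloop*bytes*2*ncomm;
  double bidibytes = 2.0*xbytes;

  // usecond() is in microseconds, so bytes/us is numerically MB/s.
  std::printf("bytes per face %.0f, uni %.1f MB/s, bidi %.1f MB/s\n",
              bytes, xbytes/time_us, bidibytes/time_us);
  return 0;
}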
In the existing Dw (Wilson Dirac operator) benchmark:

@@ -35,6 +35,7 @@ int main (int argc, char ** argv)
   LatticeFermion src(&Grid); random(pRNG,src);
   LatticeFermion result(&Grid); result=zero;
   LatticeFermion    ref(&Grid);    ref=zero;
+  LatticeFermion    err(&Grid);
   LatticeFermion    tmp(&Grid);    tmp=zero;
   LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
   std::vector<LatticeColourMatrix> U(4,&Grid);
@@ -82,12 +83,14 @@ int main (int argc, char ** argv)
   }
   double t1=usecond();
   double flops=1320*volume*ncall;
-
 
   std::cout << "Called Dw"<<std::endl;
   std::cout << "norm result "<< norm2(result)<<std::endl;
   std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;
+  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  err = ref -result;
+  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+
 
   //  for(int ss=0;ss<10;ss++ ){
   for(int ss=0;ss<0;ss++ ){
@@ -100,8 +103,5 @@ int main (int argc, char ** argv)
     }
   }
 
-  ref = ref -result;
-  std::cout << "norm diff   "<< norm2(ref)<<std::endl;
-
   Grid_finalize();
 }
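The benchmark change above gives the correctness check its own err field instead of overwriting ref, and prints it next to the timing: the optimised result is compared against the reference by the norm of their difference, which should be at (or near) zero. A tiny self-contained illustration of the same check on plain vectors follows; the data and helper are hypothetical, not Grid code.

// Compare a result against a reference via the squared L2 norm of the
// difference, mirroring "err = ref - result; norm2(err)" above.
#include <cassert>
#include <cstdio>
#include <vector>

static double norm2(const std::vector<double> &v)
{
  double s = 0.0;
  for (double x : v) s += x*x;
  return s;
}

int main(void)
{
  std::vector<double> ref    = {1.0, 2.0, 3.0};   // hypothetical reference result
  std::vector<double> result = {1.0, 2.0, 3.0};   // hypothetical optimised result

  std::vector<double> err(ref.size());
  for (size_t i = 0; i < ref.size(); i++) err[i] = ref[i] - result[i];

  std::printf("norm diff %g\n", norm2(err));
  assert(norm2(err) < 1.0e-12);                   // agreement check
  return 0;
}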
In the CartesianCommunicator header:

@@ -21,6 +21,9 @@ class CartesianCommunicator {
 
 #ifdef GRID_COMMS_MPI
     MPI_Comm communicator;
+    typedef MPI_Request CommsRequest_t;
+#else
+    typedef int CommsRequest_t;
 #endif
 
     // Constructor
@@ -77,13 +80,20 @@ class CartesianCommunicator {
       GlobalSumVector(ptr,words);
     }
     ////////////////////////////////////////////////////////////
-    // Face exchange
+    // Face exchange, buffer swap in translational invariant way
     ////////////////////////////////////////////////////////////
     void SendToRecvFrom(void *xmit,
                         int xmit_to_rank,
                         void *recv,
                         int recv_from_rank,
                         int bytes);
+    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                             void *xmit,
+                             int xmit_to_rank,
+                             void *recv,
+                             int recv_from_rank,
+                             int bytes);
+    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+
     ////////////////////////////////////////////////////////////
     // Barrier
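The header change introduces a split-phase face exchange: SendToRecvFromBegin posts non-blocking transfers and appends their handles to a caller-owned request list, and SendToRecvFromComplete waits on everything in that list, so several faces can be in flight at once (this is exactly how the new benchmark drives it). Below is a minimal plain-MPI sketch of the same request-list pattern; the helper names and buffer sizes are made up for illustration and this is not the Grid implementation.

// Split-phase exchange with a request list: post several non-blocking
// send/recv pairs, then wait on all of them together.
#include <mpi.h>
#include <cassert>
#include <vector>

static void exchange_begin(std::vector<MPI_Request> &list,
                           void *xmit, int dest, void *recv, int from,
                           int bytes, MPI_Comm comm)
{
  MPI_Request xrq, rrq;
  int rank; MPI_Comm_rank(comm, &rank);
  int ierr = MPI_Isend(xmit, bytes, MPI_CHAR, dest, rank, comm, &xrq);
  ierr    |= MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &rrq);
  assert(ierr == 0);
  list.push_back(xrq);         // both handles join the shared list
  list.push_back(rrq);
}

static void exchange_complete(std::vector<MPI_Request> &list)
{
  std::vector<MPI_Status> status(list.size());
  int ierr = MPI_Waitall((int)list.size(), &list[0], &status[0]);
  assert(ierr == 0);
}

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  std::vector<char> xbuf(1024, rank), rbuf(1024, 0);
  int right = (rank + 1) % size, left = (rank + size - 1) % size;

  std::vector<MPI_Request> requests;
  exchange_begin(requests, &xbuf[0], right, &rbuf[0], left, 1024, MPI_COMM_WORLD);
  exchange_complete(requests);   // all posted transfers finish here

  MPI_Finalize();
  return 0;
}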
In the single-node ("none") communicator implementation:

@@ -20,7 +20,7 @@ void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
 
-// Basic Halo comms primitive
+// Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
@@ -29,6 +29,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   exit(-1);
 }
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
+  exit(-1);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  exit(-1);
+}
 
 void CartesianCommunicator::Barrier(void)
 {
In the MPI communicator implementation:

@@ -29,37 +29,45 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
 }
 
 void CartesianCommunicator::GlobalSum(uint32_t &u){
-  MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
-  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
-  MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
-  MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
 }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
-  MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+  assert(ierr==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   int rank;
-  MPI_Cart_rank  (communicator, &coor[0], &rank);
+  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
+  assert(ierr==0);
   return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
   coor.resize(_ndimension);
-  MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+  assert(ierr==0);
 }
 
 // Basic Halo comms primitive
@@ -69,36 +77,64 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int from,
                                            int bytes)
 {
-  MPI_Request reqs[2];
-  MPI_Status OkeyDokey[2];
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromComplete(reqs);
+}
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
   int rank = _processor;
-  MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
-  MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
-  MPI_Waitall(2,reqs,OkeyDokey);
+  int ierr;
+  ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+
+  assert(ierr==0);
+
+  list.push_back(xrq);
+  list.push_back(rrq);
+
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  int nreq=list.size();
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+
+  assert(ierr==0);
 }
 
 void CartesianCommunicator::Barrier(void)
 {
-  MPI_Barrier(communicator);
+  int ierr = MPI_Barrier(communicator);
+  assert(ierr==0);
 }
 
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
-  MPI_Bcast(data,
-            bytes,
-            MPI_BYTE,
-            root,
-            communicator);
+  int ierr=MPI_Bcast(data,
+                     bytes,
+                     MPI_BYTE,
+                     root,
+                     communicator);
+  assert(ierr==0);
 }
 
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
-  MPI_Bcast(data,
-            bytes,
-            MPI_BYTE,
-            root,
-            MPI_COMM_WORLD);
+  int ierr= MPI_Bcast(data,
+                      bytes,
+                      MPI_BYTE,
+                      root,
+                      MPI_COMM_WORLD);
+  assert(ierr==0);
 }
 
 }
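The MPI backend changes above also adopt a uniform error-handling idiom: every MPI call's return code is captured into ierr and asserted equal to MPI_SUCCESS (which is 0). A minimal sketch of that idiom in isolation is shown below; it is an illustration only, not Grid code. With MPI's default MPI_ERRORS_ARE_FATAL handler most failures abort inside the call, so the asserts are a cheap belt-and-braces check that also documents intent.

// Capture-and-assert error checking for MPI calls, as adopted above.
#include <mpi.h>
#include <cassert>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  double d = 1.0;
  int ierr = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  assert(ierr == MPI_SUCCESS);   // MPI_SUCCESS is defined to be 0

  ierr = MPI_Barrier(MPI_COMM_WORLD);
  assert(ierr == MPI_SUCCESS);

  MPI_Finalize();
  return 0;
}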