mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Added a comms benchmark

Peter Boyle 2015-05-02 23:42:30 +01:00
parent 6a39089a43
commit ea52562527
5 changed files with 173 additions and 31 deletions

benchmarks/Grid_comms.cc (new file, +83 lines)

@@ -0,0 +1,83 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout({1,1,2,2});
  std::vector<int> mpi_layout ({1,2,2,1});

  std::cout << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

  int Nloop=10;
  for(int lat=4;lat<=16;lat+=4){
    for(int Ls=1;Ls<=16;Ls*=2){

      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      double start=usecond();
      int ncomm=0;
      for(int i=0;i<Nloop;i++){

        std::vector<int> latt_size ({lat,lat,lat,lat});
        GridCartesian Grid(latt_size,simd_layout,mpi_layout);

        std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
        std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));

        std::vector<CartesianCommunicator::CommsRequest_t> requests;

        ncomm=0;
        for(int mu=0;mu<4;mu++){
          if (mpi_layout[mu]>1 ) {
            ncomm++;

            // Exchange with the +1 neighbour in direction mu
            int comm_proc=1;
            int xmit_to_rank;
            int recv_from_rank;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
            Grid.SendToRecvFromBegin(requests,
                                     (void *)&xbuf[mu][0],
                                     xmit_to_rank,
                                     (void *)&rbuf[mu][0],
                                     recv_from_rank,
                                     bytes);

            // Exchange with the -1 neighbour (a shift of mpi_layout[mu]-1 wraps around)
            comm_proc = mpi_layout[mu]-1;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
            Grid.SendToRecvFromBegin(requests,
                                     (void *)&xbuf[mu+4][0],
                                     xmit_to_rank,
                                     (void *)&rbuf[mu+4][0],
                                     recv_from_rank,
                                     bytes);
          }
        }
        Grid.SendToRecvFromComplete(requests);
        Grid.Barrier();
      }
      double stop=usecond();

      double xbytes    = Nloop*bytes*2*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

      double time = stop-start; // usecond() reports microseconds

      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }

  Grid_finalize();
}
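For orientation, the two rate columns are simple arithmetic over the counters above, and come out directly in MB/s because usecond() reports microseconds. A standalone sketch of that arithmetic, assuming sizeof(HalfSpinColourVectorD) is 96 bytes (a half spinor of 2 spin x 3 colour complex doubles) and a hypothetical loop time; both assumptions are illustrative, not measured:

// Sketch of the benchmark's rate arithmetic. Assumes a 96-byte
// HalfSpinColourVectorD (2 spins x 3 colours x 16-byte complex double)
// and an invented 50000 usec loop time for illustration.
#include <cstdio>

int main(void)
{
  const int  lat = 16, Ls = 16, Nloop = 10;
  const int  ncomm = 2;                       // dims with mpi_layout[mu]>1 in {1,2,2,1}
  const long bytes = (long)lat*lat*lat*Ls*96; // one face message: 6291456 bytes

  double xbytes    = (double)Nloop*bytes*2*ncomm; // two sends per communicating dim per loop
  double bidibytes = 2.0*xbytes;                  // sends plus matching receives

  double time = 50000.0;                          // hypothetical loop time in microseconds
  std::printf("uni %.1f MB/s  bidi %.1f MB/s\n", xbytes/time, bidibytes/time);
  return 0;
}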


@@ -35,6 +35,7 @@ int main (int argc, char ** argv)
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeFermion ref(&Grid); ref=zero;
  LatticeFermion err(&Grid);
  LatticeFermion tmp(&Grid); tmp=zero;
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);
@@ -82,12 +83,14 @@ int main (int argc, char ** argv)
  }
  double t1=usecond();
  double flops=1320*volume*ncall;
  std::cout << "Called Dw"<<std::endl;
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref "<< norm2(ref)<<std::endl;
  std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;
  std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;
  err = ref -result;
  std::cout << "norm diff "<< norm2(err)<<std::endl;
  // for(int ss=0;ss<10;ss++ ){
  for(int ss=0;ss<0;ss++ ){
@@ -100,8 +103,5 @@ int main (int argc, char ** argv)
  }
  }
  ref = ref -result;
  std::cout << "norm diff "<< norm2(ref)<<std::endl;
  Grid_finalize();
}
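The flops=1320*volume*ncall figure above charges the conventional 1320 floating point operations per site for an Nc=3 Wilson Dslash. A small sketch of one standard way to reach that number, counting a complex multiply as 6 flops and a complex add as 2; the breakdown is textbook accounting, not taken from this commit:

// One conventional accounting of the 1320 flops/site Wilson Dslash figure.
#include <cassert>

int main(void)
{
  const int matvec  = 3*(3*6 + 2*2); // 3x3 complex matrix x vector: 66 flops
  const int project = 2*3*2;         // spin-project 4-spinor to half spinor: 12 flops
  const int dirs    = 8;             // +/- directions in 4 dimensions
  const int accum   = 7*(4*3*2);     // sum 8 reconstructed contributions: 168 flops

  assert(dirs*(2*matvec + project) + accum == 1320); // 8*(132+12)+168
  return 0;
}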


@@ -21,6 +21,9 @@ class CartesianCommunicator {
#ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
#else
    typedef int CommsRequest_t;
#endif
    // Constructor
@@ -77,13 +80,20 @@ class CartesianCommunicator {
      GlobalSumVector(ptr,words);
    }
    ////////////////////////////////////////////////////////////
    // Face exchange
    // Face exchange, buffer swap in translational invariant way
    ////////////////////////////////////////////////////////////
    void SendToRecvFrom(void *xmit,
                        int xmit_to_rank,
                        void *recv,
                        int recv_from_rank,
                        int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                             void *xmit,
                             int xmit_to_rank,
                             void *recv,
                             int recv_from_rank,
                             int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
    ////////////////////////////////////////////////////////////
    // Barrier

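The Begin/Complete split lets a caller queue every face exchange first and block only once. A minimal sketch of the intended calling pattern, lifted from benchmarks/Grid_comms.cc above; the toy 8^4 lattice and the {1,2,2,1} layout (which wants 4 MPI ranks) are illustrative choices:

#include <Grid.h>
using namespace Grid;
using namespace Grid::QCD;

// Queue non-blocking exchanges in every partitioned direction, then one wait-all.
int main(int argc,char **argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout({1,1,2,2});
  std::vector<int> mpi_layout ({1,2,2,1});
  std::vector<int> latt_size  ({8,8,8,8});
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);

  const int words = 8*8*8;  // toy face size
  std::vector<std::vector<HalfSpinColourVectorD> > xbuf(4,std::vector<HalfSpinColourVectorD>(words));
  std::vector<std::vector<HalfSpinColourVectorD> > rbuf(4,std::vector<HalfSpinColourVectorD>(words));
  int bytes = words*sizeof(HalfSpinColourVectorD);

  std::vector<CartesianCommunicator::CommsRequest_t> requests;
  for(int mu=0;mu<4;mu++){
    if(mpi_layout[mu]>1){
      int xmit_to_rank,recv_from_rank;
      Grid.ShiftedRanks(mu,1,xmit_to_rank,recv_from_rank); // neighbour ranks for a +1 shift
      Grid.SendToRecvFromBegin(requests,
                               (void *)&xbuf[mu][0],xmit_to_rank,
                               (void *)&rbuf[mu][0],recv_from_rank,
                               bytes);                     // enqueue, returns immediately
    }
  }
  Grid.SendToRecvFromComplete(requests);                   // single wait-all over all requests
  Grid.Barrier();

  Grid_finalize();
}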

@@ -20,7 +20,7 @@ void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
// Basic Halo comms primitive
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
@@ -29,6 +29,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
  exit(-1);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  exit(-1);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  exit(-1);
}
void CartesianCommunicator::Barrier(void)
{


@@ -29,37 +29,45 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
  MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  MPI_Cart_rank (communicator, &coor[0], &rank);
  int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
  int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}
// Basic Halo comms primitive
@@ -69,36 +77,64 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int from,
                                           int bytes)
{
  MPI_Request reqs[2];
  MPI_Status OkeyDokey[2];
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
  MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
  MPI_Waitall(2,reqs,OkeyDokey);
  int ierr;
  ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
}
void CartesianCommunicator::Barrier(void)
{
  MPI_Barrier(communicator);
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  MPI_Bcast(data,
            bytes,
            MPI_BYTE,
            root,
            communicator);
  int ierr=MPI_Bcast(data,
                     bytes,
                     MPI_BYTE,
                     root,
                     communicator);
  assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  MPI_Bcast(data,
            bytes,
            MPI_BYTE,
            root,
            MPI_COMM_WORLD);
  int ierr= MPI_Bcast(data,
                      bytes,
                      MPI_BYTE,
                      root,
                      MPI_COMM_WORLD);
  assert(ierr==0);
}
}
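For readers less familiar with the non-blocking pattern, here is the same handshake in plain MPI, independent of Grid: each rank tags its Isend with its own rank and posts an Irecv matched on the sender's rank, just as SendToRecvFromBegin does above, then a single Waitall completes both requests. A standalone sketch (the ring topology and payload are illustrative):

// Standalone sketch of the Isend/Irecv/Waitall handshake used above:
// send right, receive from left, tag = sender's rank so Irecv matches on `from`.
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc,char **argv)
{
  MPI_Init(&argc,&argv);

  int rank,size;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  int dest = (rank+1)%size;       // right neighbour
  int from = (rank-1+size)%size;  // left neighbour

  std::vector<char> xmit(1024,(char)rank), recv(1024,0);
  std::vector<MPI_Request> list;

  MPI_Request xrq,rrq;
  MPI_Isend(&xmit[0],(int)xmit.size(),MPI_CHAR,dest,rank,MPI_COMM_WORLD,&xrq);
  MPI_Irecv(&recv[0],(int)recv.size(),MPI_CHAR,from,from,MPI_COMM_WORLD,&rrq);
  list.push_back(xrq);
  list.push_back(rrq);

  std::vector<MPI_Status> status(list.size());
  MPI_Waitall((int)list.size(),&list[0],&status[0]);

  std::printf("rank %d received payload %d from rank %d\n",rank,(int)recv[0],from);
  MPI_Finalize();
}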