
Binary IO file for generic Grid array parallel I/O.

The number of I/O MPI tasks can be varied by selecting which
dimensions use parallel I/O and which dimensions use a serial send to a boss
I/O node.

Thus we can neck down from, say, 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes
doing the I/O.

This interpolates nicely between all nodes writing their own data, a single boss per time-plane
in processor space [the old UKQCD Fortran code did this], and a single node doing all the I/O.

I am not sure the transfer sizes are big enough, and I am not convinced fstream
is guaranteed not to give buffer inconsistencies unless I set the streambuf size to zero.

In practice it has worked on 8 tasks (2x1x2x2), writing/cloning NERSC configurations
in my MacOS + OpenMPI and Clang environment.

It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from
each node in order to gather bigger chunks at the syscall level.

That would push us up to a write chunk of circa 8 x 18*4*8 == 4KB, and by taking, say, x/y non-parallel
we get to 16MB contiguous chunks written in multiple 4KB transactions
per I/O node for configuration I/O on 64^3 lattices.

I suspect this is fine for system performance.
Peter Boyle
2015-08-26 13:40:29 +01:00
parent 612957f057
commit dc814f30da
14 changed files with 840 additions and 410 deletions

View File

@@ -87,6 +87,14 @@ class CartesianCommunicator {
void *recv,
int recv_from_rank,
int bytes);
void RecvFrom(void *recv,
int recv_from_rank,
int bytes);
void SendTo(void *xmit,
int xmit_to_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,

View File

@@ -81,13 +81,30 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::RecvFrom(void *recv,
int from,
int bytes)
{
MPI_Status stat;
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
assert(ierr==0);
}
void CartesianCommunicator::SendTo(void *xmit,
int dest,
int bytes)
{
int rank = _processor; // used for tag; must know who it comes from
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
@@ -100,7 +117,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{

View File

@@ -22,6 +22,20 @@ void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::RecvFrom(void *recv,
int recv_from_rank,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendTo(void *xmit,
int xmit_to_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,