mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 15:55:37 +00:00
dc814f30da
Number of IO MPI tasks can be varied by selecting which dimensions use parallel IO and which dimensions use Serial send to boss I/O. Thus can neck down from, say 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes doing the I/O. Interpolates nicely between ALL nodes write their data, a single boss per time-plane in processor space [old UKQCD fortran code did this], and a single node doing all I/O. Not sure I have the transfer sizes big enough and am not overly convinced fstream is guaranteed to not give buffer inconsistencies unless I set streambuf size to zero. Practically it has worked on 8 tasks, 2x1x2x2 writing /cloning NERSC configurations on my MacOS + OpenMPI and Clang environment. It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from each node in order to gather bigger chunks at the syscall level. That would push us up to the circa 8x 18*4*8 == 4KB size write chunk, and by taking, say, x/y non parallel we get to 16MB contiguous chunks written in multi 4KB transactions per IOnode in 64^3 lattices for configuration I/O. I suspect this is fine for system performance.
158 lines
3.9 KiB
C++
158 lines
3.9 KiB
C++
#include "Grid.h"
|
|
#include <mpi.h>
|
|
|
|
namespace Grid {
|
|
|
|
// Should error check all MPI calls.
|
|
|
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
|
{
|
|
_ndimension = processors.size();
|
|
std::vector<int> periodic(_ndimension,1);
|
|
|
|
_Nprocessors=1;
|
|
_processors = processors;
|
|
_processor_coor.resize(_ndimension);
|
|
|
|
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
|
MPI_Comm_rank(communicator,&_processor);
|
|
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
|
|
|
for(int i=0;i<_ndimension;i++){
|
|
_Nprocessors*=_processors[i];
|
|
}
|
|
|
|
int Size;
|
|
MPI_Comm_size(communicator,&Size);
|
|
|
|
assert(Size==_Nprocessors);
|
|
}
|
|
|
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSum(float &f){
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
|
{
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSum(double &d)
|
|
{
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
|
{
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
|
{
|
|
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
|
assert(ierr==0);
|
|
}
|
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
|
{
|
|
int rank;
|
|
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
|
assert(ierr==0);
|
|
return rank;
|
|
}
|
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
|
{
|
|
coor.resize(_ndimension);
|
|
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
// Basic Halo comms primitive
|
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|
int dest,
|
|
void *recv,
|
|
int from,
|
|
int bytes)
|
|
{
|
|
std::vector<CommsRequest_t> reqs(0);
|
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
|
SendToRecvFromComplete(reqs);
|
|
}
|
|
void CartesianCommunicator::RecvFrom(void *recv,
|
|
int from,
|
|
int bytes)
|
|
{
|
|
MPI_Status stat;
|
|
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::SendTo(void *xmit,
|
|
int dest,
|
|
int bytes)
|
|
{
|
|
int rank = _processor; // used for tag; must know who it comes from
|
|
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
// Basic Halo comms primitive
|
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
void *xmit,
|
|
int dest,
|
|
void *recv,
|
|
int from,
|
|
int bytes)
|
|
{
|
|
MPI_Request xrq;
|
|
MPI_Request rrq;
|
|
int rank = _processor;
|
|
int ierr;
|
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
|
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
|
|
|
assert(ierr==0);
|
|
|
|
list.push_back(xrq);
|
|
list.push_back(rrq);
|
|
}
|
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
{
|
|
int nreq=list.size();
|
|
std::vector<MPI_Status> status(nreq);
|
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
|
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::Barrier(void)
|
|
{
|
|
int ierr = MPI_Barrier(communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|
{
|
|
int ierr=MPI_Bcast(data,
|
|
bytes,
|
|
MPI_BYTE,
|
|
root,
|
|
communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|
{
|
|
int ierr= MPI_Bcast(data,
|
|
bytes,
|
|
MPI_BYTE,
|
|
root,
|
|
MPI_COMM_WORLD);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
}
|
|
|