2016-01-02 14:51:32 +00:00
|
|
|
/*************************************************************************************
|
|
|
|
|
|
|
|
Grid physics library, www.github.com/paboyle/Grid
|
|
|
|
|
|
|
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
|
|
|
|
|
|
|
Copyright (C) 2015
|
|
|
|
|
|
|
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
|
|
|
|
See the full license in the file "LICENSE" in the top level distribution directory
|
|
|
|
*************************************************************************************/
|
|
|
|
/* END LEGAL */
|
2017-02-22 18:09:33 +00:00
|
|
|
#include <Grid/GridCore.h>
|
2017-02-28 21:31:54 +00:00
|
|
|
#include <Grid/GridQCDcore.h>
|
|
|
|
#include <Grid/qcd/action/ActionCore.h>
|
2015-03-29 20:35:37 +01:00
|
|
|
#include <mpi.h>
|
|
|
|
|
2015-04-03 05:29:54 +01:00
|
|
|
namespace Grid {
|
2015-03-29 20:35:37 +01:00
|
|
|
|
2016-10-24 17:30:43 +01:00
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Info that is set up once and is independent of the Cartesian layout
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// World-level communicator shared by all CartesianCommunicator instances.
// Init() fills it with a duplicate of MPI_COMM_WORLD.
MPI_Comm CartesianCommunicator::communicator_world;
|
|
|
|
|
|
|
|
// Should error check all MPI calls.
|
2016-02-14 20:24:38 +00:00
|
|
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
2016-03-08 09:55:14 +00:00
|
|
|
int flag;
|
2017-02-07 06:24:54 +00:00
|
|
|
int provided;
|
2016-03-08 09:55:14 +00:00
|
|
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
|
|
|
if ( !flag ) {
|
2017-02-07 06:24:54 +00:00
|
|
|
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
2017-02-15 11:11:04 +00:00
|
|
|
if ( provided != MPI_THREAD_MULTIPLE ) {
|
|
|
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
|
|
|
}
|
2016-03-08 09:55:14 +00:00
|
|
|
}
|
2016-10-24 17:30:43 +01:00
|
|
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
|
|
|
ShmInitGeneric();
|
2016-10-21 09:03:26 +01:00
|
|
|
}
|
2015-03-29 20:35:37 +01:00
|
|
|
|
2015-06-02 16:57:12 +01:00
|
|
|
// Build a Cartesian communicator, periodic in every dimension, on top of
// communicator_world.
//
//   processors : number of ranks along each dimension. The product of the
//                entries must equal the number of ranks in communicator_world.
//
// On return _processor holds this rank within the Cartesian communicator and
// _processor_coor its coordinates in the processor grid.
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _processors = processors;
  _processor_coor.resize(_ndimension);

  _Nprocessors=1;
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  // Validate the requested layout against the world size *before* creating
  // the topology. A mismatched grid would otherwise fail inside MPI, or leave
  // some ranks without a valid communicator, before any check could fire.
  int WorldSize;
  int ierr = MPI_Comm_size(communicator_world,&WorldSize);
  assert(ierr==0);
  assert(WorldSize==_Nprocessors);

  // reorder=1: MPI is allowed to renumber ranks in the new communicator
  ierr = MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  assert(ierr==0);
  ierr = MPI_Comm_rank(communicator,&_processor);
  assert(ierr==0);
  ierr = MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
  assert(ierr==0);

  // Sanity check on the communicator that was actually created
  int Size;
  ierr = MPI_Comm_size(communicator,&Size);
  assert(ierr==0);
  assert(Size==_Nprocessors);
}
|
2015-04-22 22:46:48 +01:00
|
|
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
2015-04-22 22:46:48 +01:00
|
|
|
}
|
2016-03-12 00:06:54 +00:00
|
|
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
|
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::GlobalSum(float &f){
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
2015-03-29 20:35:37 +01:00
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::GlobalSum(double &d)
|
2015-03-29 20:35:37 +01:00
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
2015-03-29 20:35:37 +01:00
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
2015-04-03 04:52:53 +01:00
|
|
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
2015-03-29 20:35:37 +01:00
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
2015-03-29 20:35:37 +01:00
|
|
|
{
|
|
|
|
int rank;
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
|
|
|
assert(ierr==0);
|
2015-03-29 20:35:37 +01:00
|
|
|
return rank;
|
|
|
|
}
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
|
|
|
{
|
|
|
|
coor.resize(_ndimension);
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
|
|
|
assert(ierr==0);
|
2015-04-06 06:30:48 +01:00
|
|
|
}
|
2015-03-29 20:35:37 +01:00
|
|
|
|
|
|
|
// Basic Halo comms primitive
|
|
|
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
2015-04-03 04:52:53 +01:00
|
|
|
int dest,
|
2015-03-29 20:35:37 +01:00
|
|
|
void *recv,
|
2015-04-03 04:52:53 +01:00
|
|
|
int from,
|
2015-03-29 20:35:37 +01:00
|
|
|
int bytes)
|
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
std::vector<CommsRequest_t> reqs(0);
|
|
|
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
|
|
|
SendToRecvFromComplete(reqs);
|
|
|
|
}
|
2016-02-21 14:03:21 +00:00
|
|
|
|
|
|
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
|
|
void *recv,
|
|
|
|
int sender,
|
|
|
|
int receiver,
|
|
|
|
int bytes)
|
Binary IO file for generic Grid array parallel I/O.
Number of IO MPI tasks can be varied by selecting which
dimensions use parallel IO and which dimensions use Serial send to boss
I/O.
Thus can neck down from, say 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes
doing the I/O.
Interpolates nicely between ALL nodes write their data, a single boss per time-plane
in processor space [old UKQCD fortran code did this], and a single node doing all I/O.
Not sure I have the transfer sizes big enough and am not overly convinced fstream
is guaranteed to not give buffer inconsistencies unless I set streambuf size to zero.
Practically it has worked on 8 tasks, 2x1x2x2 writing /cloning NERSC configurations
on my MacOS + OpenMPI and Clang environment.
It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from
each node in order to gather bigger chunks at the syscall level.
That would push us up to the circa 8x 18*4*8 == 4KB size write chunk, and by taking, say, x/y non
parallel we get to 16MB contiguous chunks written in multi 4KB transactions
per IOnode in 64^3 lattices for configuration I/O.
I suspect this is fine for system performance.
2015-08-26 13:40:29 +01:00
|
|
|
{
|
|
|
|
MPI_Status stat;
|
2016-02-21 14:03:21 +00:00
|
|
|
assert(sender != receiver);
|
|
|
|
int tag = sender;
|
|
|
|
if ( _processor == sender ) {
|
|
|
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
|
|
|
}
|
|
|
|
if ( _processor == receiver ) {
|
|
|
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
|
|
|
}
|
Binary IO file for generic Grid array parallel I/O.
Number of IO MPI tasks can be varied by selecting which
dimensions use parallel IO and which dimensions use Serial send to boss
I/O.
Thus can neck down from, say 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes
doing the I/O.
Interpolates nicely between ALL nodes write their data, a single boss per time-plane
in processor space [old UKQCD fortran code did this], and a single node doing all I/O.
Not sure I have the transfer sizes big enough and am not overly convinced fstream
is guaranteed to not give buffer inconsistencies unless I set streambuf size to zero.
Practically it has worked on 8 tasks, 2x1x2x2 writing /cloning NERSC configurations
on my MacOS + OpenMPI and Clang environment.
It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from
each node in order to gather bigger chunks at the syscall level.
That would push us up to the circa 8x 18*4*8 == 4KB size write chunk, and by taking, say, x/y non
parallel we get to 16MB contiguous chunks written in multi 4KB transactions
per IOnode in 64^3 lattices for configuration I/O.
I suspect this is fine for system performance.
2015-08-26 13:40:29 +01:00
|
|
|
}
|
|
|
|
|
2015-05-02 23:42:30 +01:00
|
|
|
// Basic Halo comms primitive
|
2016-08-05 10:36:00 +01:00
|
|
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
|
|
void *xmit,
|
|
|
|
int dest,
|
|
|
|
void *recv,
|
|
|
|
int from,
|
|
|
|
int bytes)
|
2015-05-02 23:42:30 +01:00
|
|
|
{
|
2017-02-07 06:24:54 +00:00
|
|
|
int myrank = _processor;
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr;
|
2017-02-21 10:24:27 +00:00
|
|
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
2017-02-07 06:24:54 +00:00
|
|
|
MPI_Request xrq;
|
|
|
|
MPI_Request rrq;
|
|
|
|
|
|
|
|
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
|
|
|
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
|
|
|
|
|
|
|
assert(ierr==0);
|
|
|
|
list.push_back(xrq);
|
|
|
|
list.push_back(rrq);
|
|
|
|
} else {
|
|
|
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
|
|
|
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
|
|
|
recv,bytes,MPI_CHAR,from, from,
|
|
|
|
communicator,MPI_STATUS_IGNORE);
|
|
|
|
assert(ierr==0);
|
|
|
|
}
|
2015-05-02 23:42:30 +01:00
|
|
|
}
|
|
|
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
|
|
{
|
2017-02-21 10:24:27 +00:00
|
|
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
2017-02-07 06:24:54 +00:00
|
|
|
int nreq=list.size();
|
|
|
|
std::vector<MPI_Status> status(nreq);
|
|
|
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
|
|
|
assert(ierr==0);
|
|
|
|
}
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
|
|
|
|
2015-04-06 06:30:48 +01:00
|
|
|
void CartesianCommunicator::Barrier(void)
|
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr = MPI_Barrier(communicator);
|
|
|
|
assert(ierr==0);
|
2015-04-06 06:30:48 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr=MPI_Bcast(data,
|
|
|
|
bytes,
|
|
|
|
MPI_BYTE,
|
|
|
|
root,
|
|
|
|
communicator);
|
|
|
|
assert(ierr==0);
|
2015-04-06 06:30:48 +01:00
|
|
|
}
|
2016-10-24 17:30:43 +01:00
|
|
|
///////////////////////////////////////////////////////
|
|
|
|
// Should only be used prior to Grid Init finished.
|
|
|
|
// Check for this?
|
|
|
|
///////////////////////////////////////////////////////
|
2016-11-01 11:35:43 +00:00
|
|
|
// Return this process's rank within communicator_world.
int CartesianCommunicator::RankWorld(void){
  int r = 0;
  int ierr = MPI_Comm_rank(communicator_world,&r);
  assert(ierr==0); // previously ignored; r would have been returned uninitialised on failure
  return r;
}
|
2015-04-24 20:21:40 +01:00
|
|
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|
|
|
{
|
2015-05-02 23:42:30 +01:00
|
|
|
int ierr= MPI_Bcast(data,
|
|
|
|
bytes,
|
|
|
|
MPI_BYTE,
|
|
|
|
root,
|
2016-10-24 17:30:43 +01:00
|
|
|
communicator_world);
|
2015-05-02 23:42:30 +01:00
|
|
|
assert(ierr==0);
|
2015-04-24 20:21:40 +01:00
|
|
|
}
|
|
|
|
|
2015-03-29 20:35:37 +01:00
|
|
|
}
|
|
|
|
|