mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 15:55:37 +00:00
490 lines
15 KiB
C++
490 lines
15 KiB
C++
|
|
/*************************************************************************************
|
|
|
|
Grid physics library, www.github.com/paboyle/Grid
|
|
|
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
|
|
|
Copyright (C) 2015
|
|
|
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
See the full license in the file "LICENSE" in the top level distribution directory
|
|
*************************************************************************************/
|
|
/* END LEGAL */
|
|
#include "Grid.h"
|
|
#include <mpi.h>
|
|
|
|
namespace Grid {
|
|
|
|
|
|
|
|
// Global used by Init and nowhere else. How to hide?
|
|
// Return the rank of the calling process within MPI_COMM_WORLD.
int Rank(void) {
  int pe = -1;   // defined value even if the query fails in a build without asserts
  int ierr = MPI_Comm_rank(MPI_COMM_WORLD,&pe);
  assert(ierr==0);
  return pe;
}
|
|
// Should error check all MPI calls.
|
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|
int flag;
|
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
|
if ( !flag ) {
|
|
MPI_Init(argc,argv);
|
|
}
|
|
}
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// Want to implement some magic ... Group sub-cubes into those on same node
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
|
{
|
|
std::vector<int> coor = _processor_coor;
|
|
|
|
assert(std::abs(shift) <_processors[dim]);
|
|
|
|
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
|
|
Lexicographic::IndexFromCoor(coor,source,_processors);
|
|
source = LexicographicToWorldRank[source];
|
|
|
|
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
|
|
Lexicographic::IndexFromCoor(coor,dest,_processors);
|
|
dest = LexicographicToWorldRank[dest];
|
|
}
|
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
|
{
|
|
int rank;
|
|
Lexicographic::IndexFromCoor(coor,rank,_processors);
|
|
rank = LexicographicToWorldRank[rank];
|
|
return rank;
|
|
}
|
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
|
{
|
|
Lexicographic::CoorFromIndex(coor,rank,_processors);
|
|
rank = LexicographicToWorldRank[rank];
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// Info that is set up once and is independent of the cartesian layout
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
int CartesianCommunicator::ShmSetup = 0;
|
|
int CartesianCommunicator::ShmRank;
|
|
int CartesianCommunicator::ShmSize;
|
|
int CartesianCommunicator::GroupRank;
|
|
int CartesianCommunicator::GroupSize;
|
|
MPI_Comm CartesianCommunicator::ShmComm;
|
|
MPI_Win CartesianCommunicator::ShmWindow;
|
|
std::vector<int> CartesianCommunicator::GroupRanks;
|
|
std::vector<int> CartesianCommunicator::MyGroup;
|
|
|
|
// Build the communicator for a processor grid of shape `processors`
// (one entry per dimension; the product must equal the number of MPI ranks).
//
// The first construction in a process also performs the layout independent shared memory
// setup, guarded by ShmSetup: split the world communicator by shared memory domain, find
// which world ranks share our domain, nominate one leader per domain, number the domains,
// and allocate the shared window.
//
// Every construction then spreads the ranks of one domain over the processor grid,
// derives this rank's processor coordinate, and builds the LexicographicToWorldRank table.
//
// Changes with respect to the previous revision:
//  * _processor is the world rank. Everything else in this file uses it as an MPI rank,
//    an MPI tag, or an index into GroupRanks, all of which are world ranks.
//  * the MPI groups used for rank translation are released;
//  * the processor count is validated before it is relied upon, and the search for a
//    dimension to subdivide can no longer spin forever;
//  * the leader table is bounds checked and more MPI return codes are checked.
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();

  WorldDims = processors;

  communicator = MPI_COMM_WORLD;
  MPI_Comm_rank(communicator,&WorldRank);
  MPI_Comm_size(communicator,&WorldSize);

  int ierr;

  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  // Done first: the layout code below assumes the grid really holds WorldSize ranks.
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  assert(WorldSize==_Nprocessors);

  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Does every grid need one, or could we share across all grids via a singleton/guard?
  if ( !ShmSetup ) {

    ierr = MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
    assert(ierr==0);
    MPI_Comm_rank(ShmComm     ,&ShmRank);
    MPI_Comm_size(ShmComm     ,&ShmSize);
    GroupSize = WorldSize/ShmSize;

    /////////////////////////////////////////////////////////////////////
    // find world ranks in our SHM group (i.e. which ranks are on our node)
    /////////////////////////////////////////////////////////////////////
    MPI_Group WorldGroup, ShmGroup;
    MPI_Comm_group (communicator, &WorldGroup);
    MPI_Comm_group (ShmComm, &ShmGroup);

    std::vector<int> world_ranks(WorldSize);
    GroupRanks.resize(WorldSize);
    MyGroup.resize(ShmSize);
    for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

    // GroupRanks[r] becomes r's rank in ShmComm, or MPI_UNDEFINED if r is not in it
    MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);

    // The groups were only needed for the translation above
    MPI_Group_free(&WorldGroup);
    MPI_Group_free(&ShmGroup);

    ///////////////////////////////////////////////////////////////////
    // Identify who is in my group and nominate the leader
    ///////////////////////////////////////////////////////////////////
    int members=0;
    for(int rank=0;rank<WorldSize;rank++){
      if(GroupRanks[rank]!=MPI_UNDEFINED){
        assert(members<ShmSize);
        MyGroup[members++] = rank;
      }
    }

    // Descending sort: the leader is the highest world rank in the group
    std::sort(MyGroup.begin(),MyGroup.end(),std::greater<int>());
    int myleader = MyGroup[0];

    std::vector<int> leaders_1hot(WorldSize,0);
    std::vector<int> leaders_group(GroupSize,0);
    leaders_1hot [ myleader ] = 1;

    ///////////////////////////////////////////////////////////////////
    // global sum leaders over comm world
    ///////////////////////////////////////////////////////////////////
    ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
    assert(ierr==0);

    ///////////////////////////////////////////////////////////////////
    // find the group leaders world rank
    ///////////////////////////////////////////////////////////////////
    int group=0;
    for(int l=0;l<WorldSize;l++){
      if(leaders_1hot[l]){
        assert(group<GroupSize);   // more leaders than WorldSize/ShmSize: groups are not all the same size
        leaders_group[group++] = l;
      }
    }

    ///////////////////////////////////////////////////////////////////
    // Identify the rank of the group in which I (and my leader) live
    ///////////////////////////////////////////////////////////////////
    GroupRank=-1;
    for(int grp=0;grp<GroupSize;grp++){
      if (myleader == leaders_group[grp]){
        GroupRank=grp;
      }
    }
    assert(GroupRank!=-1);

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // allocate the shared window for our group
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ShmCommBuf = 0;
    ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
    assert(ierr==0);
    ierr = MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
    assert(ierr==0);

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Verbose for now
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout<< "Ranks per node "<< ShmSize << std::endl;
    std::cout<< "Nodes "<< GroupSize << std::endl;
    std::cout<< "Ranks "<< WorldSize << std::endl;
    std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;

    // Done
    ShmSetup=1;
  }

  ////////////////////////////////////////////////////////////////
  // Look up the window segment of every rank in our group; the stencil
  // buffer of a rank starts a quarter of the way into its segment.
  ////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(ShmSize);
  ShmStencilBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    ierr = MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
    assert(ierr==0);
    ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
  }

  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);

  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;

  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);

  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
  for(int l2=0;l2<log2size;l2++){
    // Skip dimensions that cannot be halved any further. If a full sweep finds none,
    // the ranks of a node do not fit into the processor grid: stop instead of spinning.
    int skipped = 0;
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) {
      assert(skipped < _ndimension);
      skipped++;
      dim=(dim+1)%_ndimension;
    }
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }

  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  LexicographicToWorldRank.resize(WorldSize,0);
  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;

  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  LexicographicToWorldRank[lexico]=WorldRank;
  // The processor id is the MPI world rank: it is used as MPI tag, compared with MPI
  // ranks and used to index GroupRanks. The lexicographic position lives in the table.
  _processor = WorldRank;

  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  // Each rank contributed only its own slot, so the sum assembles the full table.
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
}
|
|
|
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSum(float &f){
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
|
{
|
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSum(double &d)
|
|
{
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
|
{
|
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
|
|
// Basic Halo comms primitive
|
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|
int dest,
|
|
void *recv,
|
|
int from,
|
|
int bytes)
|
|
{
|
|
std::vector<CommsRequest_t> reqs(0);
|
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
|
SendToRecvFromComplete(reqs);
|
|
}
|
|
|
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
void *recv,
|
|
int sender,
|
|
int receiver,
|
|
int bytes)
|
|
{
|
|
MPI_Status stat;
|
|
assert(sender != receiver);
|
|
int tag = sender;
|
|
if ( _processor == sender ) {
|
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
|
}
|
|
if ( _processor == receiver ) {
|
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
|
}
|
|
}
|
|
|
|
// Basic Halo comms primitive
|
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
void *xmit,
|
|
int dest,
|
|
void *recv,
|
|
int from,
|
|
int bytes)
|
|
{
|
|
#undef SHM_USE_BCOPY
|
|
MPI_Request xrq;
|
|
MPI_Request rrq;
|
|
|
|
static int sequence;
|
|
|
|
int rank = _processor;
|
|
int ierr;
|
|
int tag;
|
|
int check;
|
|
|
|
assert(dest != _processor);
|
|
assert(from != _processor);
|
|
|
|
int gdest = GroupRanks[dest];
|
|
int gme = GroupRanks[_processor];
|
|
|
|
sequence++;
|
|
|
|
char *to_ptr = (char *)ShmCommBufs[gdest];
|
|
char *from_ptr = (char *)ShmCommBufs[ShmRank];
|
|
|
|
int small = (bytes<MAX_MPI_SHM_BYTES) || (shm_mode==0);
|
|
|
|
#ifndef SHM_USE_BCOPY
|
|
typedef double T;
|
|
int words = bytes/sizeof(T);
|
|
assert(((size_t)bytes &(sizeof(T)-1))==0);
|
|
// assert(((size_t)xmit &(sizeof(T)-1))==0);
|
|
// assert(((size_t)recv &(sizeof(T)-1))==0);
|
|
#endif
|
|
|
|
assert(gme == ShmRank);
|
|
|
|
// std::cerr << "proc dest from gme gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
|
|
if ( small && (dest !=MPI_UNDEFINED) ) {
|
|
assert(gme != gdest);
|
|
|
|
#ifdef SHM_USE_BCOPY
|
|
bcopy(xmit,to_ptr,bytes);
|
|
#else
|
|
T *ip = (T *)xmit;
|
|
T *op = (T *)to_ptr;
|
|
PARALLEL_FOR_LOOP
|
|
for(int w=0;w<words;w++) {
|
|
op[w]=ip[w];
|
|
}
|
|
#endif
|
|
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
|
|
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
|
|
} else {
|
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
|
assert(ierr==0);
|
|
list.push_back(xrq);
|
|
}
|
|
|
|
// std::cout << "Syncing "<<std::endl; Barrier();
|
|
MPI_Win_sync (ShmWindow);
|
|
MPI_Barrier (ShmComm);
|
|
MPI_Win_sync (ShmWindow);
|
|
|
|
// std::cout << "Receiving "<<std::endl; Barrier();
|
|
|
|
if (small && (from !=MPI_UNDEFINED) ) {
|
|
#ifdef SHM_USE_BCOPY
|
|
bcopy(from_ptr,recv,bytes);
|
|
#else
|
|
T *ip = (T *)from_ptr;
|
|
T *op = (T *)recv;
|
|
PARALLEL_FOR_LOOP
|
|
for(int w=0;w<words;w++) {
|
|
op[w]=ip[w];
|
|
}
|
|
#endif
|
|
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
|
|
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
|
|
assert(check==sequence);
|
|
assert(tag==from);
|
|
} else {
|
|
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
|
assert(ierr==0);
|
|
list.push_back(rrq);
|
|
}
|
|
|
|
// std::cout << "Syncing"<<std::endl; Barrier();
|
|
|
|
MPI_Win_sync (ShmWindow);
|
|
MPI_Barrier (ShmComm);
|
|
MPI_Win_sync (ShmWindow);
|
|
|
|
#if 0
|
|
MPI_Request xrq;
|
|
MPI_Request rrq;
|
|
int rank = _processor;
|
|
int ierr;
|
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
|
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
|
|
|
assert(ierr==0);
|
|
|
|
list.push_back(xrq);
|
|
list.push_back(rrq);
|
|
#endif
|
|
}
|
|
|
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
{
|
|
int nreq=list.size();
|
|
std::vector<MPI_Status> status(nreq);
|
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
|
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::Barrier(void)
|
|
{
|
|
int ierr = MPI_Barrier(communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|
{
|
|
int ierr=MPI_Bcast(data,
|
|
bytes,
|
|
MPI_BYTE,
|
|
root,
|
|
communicator);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|
{
|
|
int ierr= MPI_Bcast(data,
|
|
bytes,
|
|
MPI_BYTE,
|
|
root,
|
|
MPI_COMM_WORLD);
|
|
assert(ierr==0);
|
|
}
|
|
|
|
}
|
|
|