1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

MPI3 working with a bounce through shared memory on my laptop.

Longer term plan: make the "u_comm_buf" in Stencil point to the shared region and avoid the
send between ranks on same node.
This commit is contained in:
paboyle 2016-10-21 09:03:26 +01:00
parent 5b5925b8e5
commit a762b1fb71
8 changed files with 208 additions and 116 deletions

View File

@ -208,7 +208,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
RealF sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){

View File

@ -151,15 +151,6 @@ public:
void deallocate(pointer __p, size_type) {
shmem_free((void *)__p);
}
#elif defined(GRID_COMMS_MPI3)
pointer allocate(size_type __n, const void* _p= 0)
{
#error "implement MPI3 windowed allocate"
}
void deallocate(pointer __p, size_type) {
#error "implement MPI3 windowed allocate"
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{

View File

@ -279,7 +279,7 @@ void Grid_init(int *argc,char ***argv)
void Grid_finalize(void)
{
#ifdef GRID_COMMS_MPI
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Finalize();
Grid_unquiesce_nodes();
#endif

View File

@ -73,7 +73,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
int me = 0;
#ifdef GRID_COMMS_MPI
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM

View File

@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef _OPENMP
#define GRID_OMP
#warning "OpenMP"
#endif
#define UNROLL _Pragma("unroll")
@ -127,6 +128,22 @@ class GridThread {
ThreadBarrier();
};
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
};
}

View File

@ -56,10 +56,13 @@ class CartesianCommunicator {
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
#elif GRID_COMMS_MPI3
int shm_mode;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms
std::vector<int> WorldDims;
std::vector<int> GroupDims;
@ -69,14 +72,23 @@ class CartesianCommunicator {
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
int GroupRank;
int ShmRank;
int WorldRank;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
int GroupSize;
int ShmSize;
void * ShmCommBuf;
std::vector<void *> ShmCommBufs;
int WorldRank;
int WorldSize;
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
std::vector<int> LexicographicToWorldRank;
#else
typedef int CommsRequest_t;

View File

@ -168,7 +168,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}

View File

@ -30,6 +30,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Global used by Init and nowhere else. How to hide?
int Rank(void) {
int pe;
@ -76,29 +78,129 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
rank = LexicographicToWorldRank[rank];
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::cout << "Creating "<< _ndimension << " dim communicator "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << processors[d]<<" ";
};
std::cout << std::endl;
WorldDims = processors;
communicator = MPI_COMM_WORLD;
MPI_Comm shmcomm;
MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&shmcomm);
MPI_Comm_rank(communicator,&WorldRank);
MPI_Comm_size(communicator,&WorldSize);
MPI_Comm_rank(shmcomm ,&ShmRank);
MPI_Comm_size(shmcomm ,&ShmSize);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Does every grid need one, or could we share across all grids via a singleton/guard?
int ierr;
if ( !ShmSetup ) {
MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and noninate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::greater<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout<< "Ranks per node "<< ShmSize << std::endl;
std::cout<< "Nodes "<< GroupSize << std::endl;
std::cout<< "Ranks "<< WorldSize << std::endl;
std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
// Done
ShmSetup=1;
}
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@ -130,34 +232,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
dim=(dim+1)%_ndimension;
}
std::cout << "Shm group dims "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << ShmDims[d]<<" ";
};
std::cout << std::endl;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<_ndimension;d++){
GroupDims[d] = WorldDims[d]/ShmDims[d];
}
std::cout << "Group dims "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << GroupDims[d]<<" ";
};
std::cout << std::endl;
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator, &WorldGroup);
MPI_Comm_group (shmcomm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
std::vector<int> group_ranks(WorldSize);
std::vector<int> mygroup(GroupSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &group_ranks[0]);
////////////////////////////////////////////////////////////////
// Check processor counts match
@ -166,56 +246,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors;
_processor_coor.resize(_ndimension);
for(int i=0;i<_ndimension;i++){
std::cout << " p " << _processors[i]<<std::endl;
_Nprocessors*=_processors[i];
}
std::cout << " World " <<WorldSize <<" Nproc "<<_Nprocessors<<std::endl;
assert(WorldSize==_Nprocessors);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and noninate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(group_ranks[rank]!=MPI_UNDEFINED){
mygroup[g] = rank;
}
}
std::sort(mygroup.begin(),mygroup.end(),std::greater<int>());
int myleader = mygroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
//
@ -309,16 +343,56 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
int tag;
int small = (bytes<MAX_MPI_SHM_BYTES) || (shm_mode==0);
static int sequence;
int check;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gme = GroupRanks[_processor];
sequence++;
assert(gme == ShmRank);
if ( small && (dest !=MPI_UNDEFINED) ) {
char *ptr = (char *)ShmCommBufs[gdest];
assert(gme != gdest);
GridThread::bcopy(xmit,ptr,bytes);
bcopy(&_processor,&ptr[bytes],sizeof(_processor));
bcopy(& sequence,&ptr[bytes+4],sizeof(sequence));
} else {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
}
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
if (small && (from !=MPI_UNDEFINED) ) {
char *ptr = (char *)ShmCommBufs[ShmRank];
GridThread::bcopy(ptr,recv,bytes);
bcopy(&ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
assert(tag==from);
} else {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();