diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
deleted file mode 100644
index 2075e4bf..00000000
--- a/lib/communicator/Communicator_mpi.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/communicator/Communicator_mpi.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/ActionCore.h>
-#include <mpi.h>
-
-namespace Grid {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is set up once and is independent of the Cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-MPI_Comm CartesianCommunicator::communicator_world;
-
-// Should error-check all MPI calls.
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  int flag;
-  int provided;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    if ( provided != MPI_THREAD_MULTIPLE ) {
-      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-    }
-  }
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::~CartesianCommunicator()
-{
-  int MPI_is_finalised;
-  MPI_Finalized(&MPI_is_finalised);
-  if (communicator && !MPI_is_finalised)
-    MPI_Comm_free(&communicator);
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank(communicator,&coor[0],&rank);
-  assert(ierr==0);
-  return rank;
-}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords(communicator,rank,_ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-                                           int dest,
-                                           void *recv,
-                                           int from,
-                                           int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-                                           void *recv,
-                                           int sender,
-                                           int receiver,
-                                           int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) {
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes)
-{
-  int myrank = _processor;
-  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    MPI_Request xrq;
-    MPI_Request rrq;
-
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else {
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                      recv,bytes,MPI_CHAR,from,from,
-                      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    int nreq=list.size();
-    std::vector<MPI_Status> status(nreq);
-    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-    assert(ierr==0);
-  }
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
-  assert(ierr==0);
-}
-///////////////////////////////////////////////////////
-// Should only be used prior to Grid Init being finished.
-// Check for this?
-///////////////////////////////////////////////////////
-int CartesianCommunicator::RankWorld(void){
-  int r;
-  MPI_Comm_rank(communicator_world,&r);
-  return r;
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
-  assert(ierr==0);
-}
-
-}
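The deleted file's concurrent policy is the standard "post the receive first, then the send, then wait" MPI idiom. A minimal, self-contained sketch of that pattern follows; it is illustrative only (the buffer sizes and neighbour arithmetic are assumptions, not Grid code), but it uses the same tag convention (tag = sending rank) as the code above.

```cpp
// Standalone sketch of the Irecv/Isend/Waitall halo-exchange policy.
// Build with an MPI wrapper (e.g. mpicxx) and run with 2+ ranks.
#include <mpi.h>
#include <vector>
#include <cassert>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int dest = (rank + 1) % size;        // right neighbour
  int from = (rank + size - 1) % size; // left neighbour

  std::vector<char> xmit(1024, (char)rank), recv(1024);
  std::vector<MPI_Request> reqs;

  // Post the receive before the send; tags follow the deleted code: tag = sender.
  MPI_Request rrq, xrq;
  int ierr = MPI_Irecv(recv.data(), recv.size(), MPI_CHAR, from, from, MPI_COMM_WORLD, &rrq);
  ierr    |= MPI_Isend(xmit.data(), xmit.size(), MPI_CHAR, dest, rank, MPI_COMM_WORLD, &xrq);
  assert(ierr == 0);
  reqs.push_back(xrq);
  reqs.push_back(rrq);

  std::vector<MPI_Status> status(reqs.size());
  ierr = MPI_Waitall(reqs.size(), reqs.data(), status.data());
  assert(ierr == 0);

  MPI_Finalize();
  return 0;
}
```

The sequential (`CommunicatorPolicySequential`) branch replaces the pair of nonblocking calls with one `MPI_Sendrecv`, handing the CPU to MPI immediately rather than deferring completion to `SendToRecvFromComplete`.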
diff --git a/lib/communicator/Communicator_mpi3_leader.cc b/lib/communicator/Communicator_mpi3_leader.cc
deleted file mode 100644
index 6e26bd3e..00000000
--- a/lib/communicator/Communicator_mpi3_leader.cc
+++ /dev/null
@@ -1,988 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/communicator/Communicator_mpi3_leader.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include "Grid.h"
-#include <mpi.h>
-//#include <numaif.h>
-
-////////////////////////////////////////////////////////////////////////////////////////////
-/// Workarounds:
-/// i)  Mac OS does not implement unnamed semaphores, since they are "optional" POSIX,
-///     and Darwin dispatch semaphores do not appear to work across processes.
-///
-/// ii) OpenMPI under --mca shmem posix works with two squadrons per node;
-///     OpenMPI under default MCA settings (apparently --mca shmem mmap) on MacOS makes
-///     two squadrons map the SAME memory as each other, despite living on different
-///     communicators. This appears to be a bug in OpenMPI.
-////////////////////////////////////////////////////////////////////////////////////////////
-#include <semaphore.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <limits.h>
-typedef sem_t *Grid_semaphore;
-
-#error /* This file is deprecated */
-
-#if 0
-#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
-#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
-#define SEM_POST(S) assert ( sem_post(S) == 0 );
-#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
-#else
-#define SEM_INIT(S) ;
-#define SEM_INIT_EXCL(S) ;
-#define SEM_POST(S) ;
-#define SEM_WAIT(S) ;
-#endif
-#include <sys/mman.h>
-
-namespace Grid {
-
-enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
-
-struct Descriptor {
-  uint64_t buf;
-  size_t bytes;
-  int rank;
-  int tag;
-  int command;
-  uint64_t xbuf;
-  uint64_t rbuf;
-  int xtag;
-  int rtag;
-  int src;
-  int dest;
-  MPI_Request request;
-};
-
-const int pool = 48;
-
-class SlaveState {
-public:
-  volatile int head;
-  volatile int start;
-  volatile int tail;
-  volatile Descriptor Descrs[pool];
-};
-
-class Slave {
-public:
-  Grid_semaphore sem_head;
-  Grid_semaphore sem_tail;
-  SlaveState *state;
-  MPI_Comm squadron;
-  uint64_t base;
-  int universe_rank;
-  int vertical_rank;
-  char sem_name [NAME_MAX];
-  ////////////////////////////////////////////////////////////
-  // Descriptor circular pointers
-  ////////////////////////////////////////////////////////////
-  Slave() {};
-
-  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
-
-  void SemInit(void) {
-    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
-    SEM_INIT(sem_head);
-    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
-    SEM_INIT(sem_tail);
-  }
-  void SemInitExcl(void) {
-    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
-    SEM_INIT_EXCL(sem_head);
-    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
-    SEM_INIT_EXCL(sem_tail);
-  }
-  void WakeUpDMA(void) {
-    SEM_POST(sem_head);
-  };
-  void WakeUpCompute(void) {
-    SEM_POST(sem_tail);
-  };
-  void WaitForCommand(void) {
-    SEM_WAIT(sem_head);
-  };
-  void WaitForComplete(void) {
-    SEM_WAIT(sem_tail);
-  };
-  void EventLoop (void) {
-    // std::cout << " Entering event loop " << std::endl;
-    while(1){
-      WaitForCommand();
-#if 0
-      _mm_monitor((void *)&state->head,0,0);
-      int s=state->start;
-      if ( s != state->head ) {
-        _mm_mwait(0,0);
-      }
-#endif
-      Event();
-    }
-  }
-
-  int Event (void) ;
-
-  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
-  void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
-
-  void WaitAll() {
-    // std::cout << "Queueing WAIT command " << std::endl;
-    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
-    // std::cout << "Waiting on semaphore " << std::endl;
-    WakeUpDMA();
-    WaitForComplete();
-    // std::cout << "Checking FIFO is empty " << std::endl;
-    while ( state->tail != state->head );
-  }
-};
-
-////////////////////////////////////////////////////////////////////////
-// One instance of a data mover.
-// Master and Slave must agree on location in shared memory
-////////////////////////////////////////////////////////////////////////
-
-class MPIoffloadEngine {
-public:
-
-  static std::vector<Slave> Slaves;
-
-  static int ShmSetup;
-
-  static int UniverseRank;
-  static int UniverseSize;
-
-  static MPI_Comm communicator_universe;
-  static MPI_Comm communicator_cached;
-
-  static MPI_Comm HorizontalComm;
-  static int HorizontalRank;
-  static int HorizontalSize;
-
-  static MPI_Comm VerticalComm;
-  static MPI_Win  VerticalWindow;
-  static int VerticalSize;
-  static int VerticalRank;
-
-  static std::vector<void *> VerticalShmBufs;
-  static std::vector<std::vector<int> > UniverseRanks;
-  static std::vector<int> UserCommunicatorToWorldRanks;
-
-  static MPI_Group WorldGroup, CachedGroup;
-
-  static void CommunicatorInit (MPI_Comm &communicator_world,
-                                MPI_Comm &ShmComm,
-                                void * &ShmCommBuf);
-
-  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
-
-  /////////////////////////////////////////////////////////
-  // routines for the master proc must handle any communicator
-  /////////////////////////////////////////////////////////
-
-  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-    // std::cout << " Queueing send " << bytes << " slave " << slave << " to comm rank " << rank << std::endl;
-    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
-    Slaves[slave].WakeUpDMA();
-  };
-
-  static void QueueRecv(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-    // std::cout << " Queueing recv " << bytes << " slave " << slave << " from comm rank " << rank << std::endl;
-    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
-    Slaves[slave].WakeUpDMA();
-  };
-
-  static void WaitAll() {
-    for(int s=1;s<VerticalSize;s++) {
-      Slaves[s].WaitAll();
-    }
-  };
-
-  static void GetWork(int nwork, int me, int & mywork, int & myoff, int units) {
-    int basework = nwork/units;
-    int backfill = units*basework-nwork;
-    if ( me >= units ) {
-      mywork = myoff = 0;
-    } else {
-      mywork = (nwork+me)/units;
-      myoff  = basework * me;
-      if ( me > backfill )
-        myoff+= (me-backfill);
-    }
-    return;
-  };
-
-  static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
-    Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
-    Slaves[slave].WakeUpDMA();
-  };
-
-  static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
-    uint8_t * cxbuf = (uint8_t *) xbuf;
-    uint8_t * crbuf = (uint8_t *) rbuf;
-    static int rrp=0;
-    int procs = VerticalSize-1;
-    int myoff=0;
-    int mywork=bytes;
-    QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
-    rrp = rrp+1;
-    if ( rrp == (VerticalSize-1) ) rrp = 0;
-  }
-
-  static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
-    uint8_t * cxbuf = (uint8_t *) xbuf;
-    uint8_t * crbuf = (uint8_t *) rbuf;
-    int mywork, myoff, procs;
-    procs = VerticalSize-1;
-    for(int s=0;s<procs;s++) {
-      GetWork(bytes,s,mywork,myoff,procs);
-      QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
-    }
-  };
-
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// Static data
-//////////////////////////////////////////////////////////////////////////////
-std::vector<Slave> MPIoffloadEngine::Slaves;
-
-int MPIoffloadEngine::UniverseRank;
-int MPIoffloadEngine::UniverseSize;
-
-MPI_Comm  MPIoffloadEngine::communicator_universe;
-MPI_Comm  MPIoffloadEngine::communicator_cached;
-MPI_Group MPIoffloadEngine::WorldGroup;
-MPI_Group MPIoffloadEngine::CachedGroup;
-
-MPI_Comm MPIoffloadEngine::HorizontalComm;
-int      MPIoffloadEngine::HorizontalRank;
-int      MPIoffloadEngine::HorizontalSize;
-
-MPI_Comm MPIoffloadEngine::VerticalComm;
-int      MPIoffloadEngine::VerticalSize;
-int      MPIoffloadEngine::VerticalRank;
-MPI_Win  MPIoffloadEngine::VerticalWindow;
-std::vector<void *>            MPIoffloadEngine::VerticalShmBufs;
-std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
-std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks;
-
-int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
-int MPIoffloadEngine::ShmSetup = 0;
-
-void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
-                                         MPI_Comm &ShmComm,
-                                         void * &ShmCommBuf)
-{
-  int flag;
-  assert(ShmSetup==0);
-
-  //////////////////////////////////////////////////////////////////////
-  // Universe is all nodes prior to squadron grouping
-  //////////////////////////////////////////////////////////////////////
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
-  MPI_Comm_rank(communicator_universe,&UniverseRank);
-  MPI_Comm_size(communicator_universe,&UniverseSize);
-
-  /////////////////////////////////////////////////////////////////////
-  // Split into groups that can share memory (Verticals)
-  /////////////////////////////////////////////////////////////////////
-#undef MPI_SHARED_MEM_DEBUG
-#ifdef MPI_SHARED_MEM_DEBUG
-  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
-#else
-  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
-#endif
-  MPI_Comm_rank(VerticalComm ,&VerticalRank);
-  MPI_Comm_size(VerticalComm ,&VerticalSize);
-
-  //////////////////////////////////////////////////////////////////////
-  // Split into horizontal groups by rank in squadron
-  //////////////////////////////////////////////////////////////////////
-  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
-  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
-  MPI_Comm_size(HorizontalComm,&HorizontalSize);
-  assert(HorizontalSize*VerticalSize==UniverseSize);
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // What is my place in the world
-  ////////////////////////////////////////////////////////////////////////////////
-  int WorldRank=0;
-  if(VerticalRank==0) WorldRank = HorizontalRank;
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
-  assert(ierr==0);
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Where is the world in the universe?
-  ////////////////////////////////////////////////////////////////////////////////
-  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
-  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
-  for(int w=0;w<HorizontalSize;w++){
-    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
-    assert(ierr==0);
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////
-  // Allocate the shared memory segments for our squadron; slot 0 is the comms buffer,
-  // the remaining slots each hold one SlaveState.
-  //////////////////////////////////////////////////////////////////////////////////////////
-  VerticalShmBufs.resize(VerticalSize);
-
-  char shm_name [NAME_MAX];
-  MPI_Barrier(VerticalComm);
-
-  if ( VerticalRank == 0 ) {
-    for(int r=0;r<VerticalSize;r++){
-
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      if ( r>0 ) size = sizeof(SlaveState);
-
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
-
-      shm_unlink(shm_name);
-
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
-      if ( fd < 0 ) {
-        perror("failed shm_open");
-        assert(0);
-      }
-
-      ftruncate(fd, size);
-
-      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( VerticalShmBufs[r] == MAP_FAILED ) {
-        perror("failed mmap");
-        assert(0);
-      }
-
-      /*
-      for(uint64_t page=0;page<size;page+=4096){
-        void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
-        int status;
-        int flags=MPOL_MF_MOVE_ALL;
-        int nodes=1; // numa domain == MCDRAM
-        unsigned long count=1;
-        ierr= move_pages(0,count, &pages,&nodes,&status,flags);
-        if (ierr && (page==0)) perror("numa relocate command failed");
-      }
-      */
-      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
-      check[0] = WorldRank;
-      check[1] = r;
-    }
-  }
-
-  MPI_Barrier(VerticalComm);
-
-  if ( VerticalRank != 0 ) {
-    for(int r=0;r<VerticalSize;r++){
-
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      if ( r>0 ) size = sizeof(SlaveState);
-
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
-
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
-      if ( fd<0 ) {
-        perror("failed shm_open");
-        assert(0);
-      }
-      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-
-      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
-      assert(check[0]== WorldRank);
-      assert(check[1]== r);
-      // std::cerr << "SHM " << r << " " << VerticalShmBufs[r] << std::endl;
-    }
-  }
-
-  MPI_Barrier(VerticalComm);
-
-  //////////////////////////////////////////////////////////////////////
-  // The leaders (vertical rank 0) form the world seen by the rest of Grid;
-  // stash the comms buffer and world group for later rank translation.
-  //////////////////////////////////////////////////////////////////////
-  communicator_world = HorizontalComm;
-  ShmComm            = VerticalComm;
-  ShmCommBuf         = VerticalShmBufs[0];
-  MPI_Comm_group (communicator_world, &WorldGroup);
-
-  ///////////////////////////////////////////////////////////
-  // Start the slave data movers
-  ///////////////////////////////////////////////////////////
-  if ( VerticalRank != 0 ) {
-    Slave indentured;
-    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
-    indentured.SemInitExcl();// init semaphore in shared memory
-    MPI_Barrier(VerticalComm);
-    indentured.EventLoop();
-    assert(0); // the event loop never returns
-  } else {
-    Slaves.resize(VerticalSize);
-    for(int i=1;i<VerticalSize;i++){
-      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm,UniverseRanks[HorizontalRank][i],i);
-    }
-    MPI_Barrier(VerticalComm);
-    for(int i=1;i<VerticalSize;i++){
-      Slaves[i].SemInit();
-    }
-  }
-
-  ShmSetup=1;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-// Map a (communicator,rank) pair to a world rank, caching the most recent user communicator
-///////////////////////////////////////////////////////////////////////////////////////////////
-void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank)
-{
-  if ( comm == HorizontalComm ) {
-    comm_world_peer = commrank;
-  } else if ( comm == communicator_cached ) {
-    comm_world_peer = UserCommunicatorToWorldRanks[commrank];
-  } else {
-
-    int size;
-    MPI_Comm_size(comm,&size);
-
-    UserCommunicatorToWorldRanks.resize(size);
-
-    std::vector<int> cached_ranks(size);
-
-    for(int r=0;r<size;r++) {
-      cached_ranks[r]=r;
-    }
-
-    communicator_cached=comm;
-
-    MPI_Comm_group(comm, &CachedGroup);
-
-    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup,&UserCommunicatorToWorldRanks[0]);
-
-    comm_world_peer = UserCommunicatorToWorldRanks[commrank];
-
-    assert(comm_world_peer != MPI_UNDEFINED);
-  }
-
-  assert( (tag & (~0xFFFFL)) ==0);
-
-  uint64_t icomm = (uint64_t)comm;
-  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
-                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
-
-  // hashtag = (comm_hash<<15) | tag;
-  hashtag = tag;
-
-};
-
-void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
-{
-  squadron     =_squadron;
-  universe_rank=_universe_rank;
-  vertical_rank=_vertical_rank;
-  state        =_state;
-  // std::cout << "state " << _state << " comm " << _squadron << " universe_rank " << universe_rank << std::endl;
-  state->head = state->tail = state->start = 0;
-  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
-  int rank; MPI_Comm_rank(_squadron,&rank);
-}
-#define PERI_PLUS(A) ( (A+1)%pool )
-int Slave::Event (void) {
-
-  static int tail_last;
-  static int head_last;
-  static int start_last;
-  int ierr;
-  MPI_Status stat;
-  static int i=0;
-
-  ////////////////////////////////////////////////////
-  // Try to advance the start pointers
-  ////////////////////////////////////////////////////
-  int s=state->start;
-  if ( s != state->head ) {
-    switch ( state->Descrs[s].command ) {
-    case COMMAND_ISEND:
-      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
-                       state->Descrs[s].bytes,
-                       MPI_CHAR,
-                       state->Descrs[s].rank,
-                       state->Descrs[s].tag,
-                       MPIoffloadEngine::communicator_universe,
-                       (MPI_Request *)&state->Descrs[s].request);
-      assert(ierr==0);
-      state->start = PERI_PLUS(s);
-      return 1;
-      break;
-
-    case COMMAND_IRECV:
-      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
-                     state->Descrs[s].bytes,
-                     MPI_CHAR,
-                     state->Descrs[s].rank,
-                     state->Descrs[s].tag,
-                     MPIoffloadEngine::communicator_universe,
-                     (MPI_Request *)&state->Descrs[s].request);
-
-      // std::cout << " Request is " << state->Descrs[s].request << std::endl;
-      // std::cout << " Request0 is " << state->Descrs[0].request << std::endl;
-      assert(ierr==0);
-      state->start = PERI_PLUS(s);
-      return 1;
-      break;
-
-    case COMMAND_SENDRECV:
-
-      // fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest,state->Descrs[s].xtag+i*10,state->Descrs[s].src,state->Descrs[s].rtag+i*10);
-
-      ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
-                        (void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
-                        MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
-
-      assert(ierr==0);
-
-      // fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
-      // MPI_Barrier(MPIoffloadEngine::HorizontalComm);
-      // fprintf(stderr,"Barrier\n");
-      i++;
-
-      state->start = PERI_PLUS(s);
-
-      return 1;
-      break;
-
-    case COMMAND_WAITALL:
-
-      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
-        if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
-          MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
-        }
-      };
-      s=PERI_PLUS(s);
-      state->start = s;
-      state->tail  = s;
-
-      WakeUpCompute();
-
-      return 1;
-      break;
-
-    default:
-      assert(0);
-      break;
-    }
-  }
-  return 0;
-}
-//////////////////////////////////////////////////////////////////////////////
-// External interaction with the queue
-//////////////////////////////////////////////////////////////////////////////
-
-void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
-{
-  int head =state->head;
-  int next = PERI_PLUS(head);
-
-  // Set up descriptor
-  int worldrank;
-  int hashtag;
-  MPI_Comm communicator;
-  MPI_Request request;
-  uint64_t relative;
-
-  relative = (uint64_t)xbuf - base;
-  state->Descrs[head].xbuf = relative;
-
-  relative = (uint64_t)rbuf - base;
-  state->Descrs[head].rbuf = relative;
-
-  state->Descrs[head].bytes = bytes;
-
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
-  state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].xtag = hashtag;
-
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
-  state->Descrs[head].src  = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].rtag = hashtag;
-
-  state->Descrs[head].command= COMMAND_SENDRECV;
-
-  // Block until FIFO has space
-  while( state->tail==next );
-
-  // Msync on weak order architectures
-
-  // Advance pointer
-  state->head = next;
-
-};
-uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
-{
-  /////////////////////////////////////////
-  // Spin while the FIFO is full
-  /////////////////////////////////////////
-  int head =state->head;
-  int next = PERI_PLUS(head);
-
-  // Set up descriptor
-  int worldrank;
-  int hashtag;
-  MPI_Comm communicator;
-  MPI_Request request;
-
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
-
-  uint64_t relative= (uint64_t)buf - base;
-  state->Descrs[head].buf    = relative;
-  state->Descrs[head].bytes  = bytes;
-  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].tag    = hashtag;
-  state->Descrs[head].command= command;
-
-  /*
-  if ( command == COMMAND_ISEND ) {
-    std::cout << "QueueSend from " << universe_rank << " to commrank " << commrank
-              << " to worldrank " << worldrank << std::endl;
-  }
-  */
-
-  // Block until FIFO has space
-  while( state->tail==next );
-
-  // Msync on weak order architectures
-  // Advance pointer
-  state->head = next;
-
-  return 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is set up once and is independent of the Cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-MPI_Comm CartesianCommunicator::communicator_world;
-
-void CartesianCommunicator::Init(int *argc, char ***argv)
-{
-  int flag;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init(argc,argv);
-  }
-  communicator_world = MPI_COMM_WORLD;
-  MPI_Comm ShmComm;
-  MPIoffloadEngine::CommunicatorInit(communicator_world,ShmComm,ShmCommBuf);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank(communicator,&coor[0],&rank);
-  assert(ierr==0);
-  return rank;
-}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords(communicator,rank,_ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
-{
-  _ndimension = processors.size();
-  std::vector<int> periodic(_ndimension,1);
-
-  _Nprocessors=1;
-  _processors = processors;
-
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  int Size;
-  MPI_Comm_size(communicator_world,&Size);
-  assert(Size==_Nprocessors);
-
-  _processor_coor.resize(_ndimension);
-  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
-  MPI_Comm_rank (communicator,&_processor);
-  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
-};
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-                                           int dest,
-                                           void *recv,
-                                           int from,
-                                           int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-                                           void *recv,
-                                           int sender,
-                                           int receiver,
-                                           int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) {
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
-}
-
-void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                       void *xmit,
-                                                       int dest,
-                                                       void *recv,
-                                                       int from,
-                                                       int bytes)
-{
-  uint64_t xmit_i = (uint64_t) xmit;
-  uint64_t recv_i = (uint64_t) recv;
-  uint64_t shm    = (uint64_t) ShmCommBuf;
-  // assert xmit and recv lie in the shared memory region
-  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
-  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
-  assert(from!=_processor);
-  assert(dest!=_processor);
-
-  MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-
-  //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-
-  //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
-  //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
-}
-
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  MPIoffloadEngine::WaitAll();
-  //this->Barrier();
-}
-
-void CartesianCommunicator::StencilBarrier(void) { }
-
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  int nreq=list.size();
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
-  assert(ierr==0);
-}
-
-void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
-
-void *CartesianCommunicator::ShmBuffer(int rank) {
-  return NULL;
-}
-void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
-  return NULL;
-}
-
-};
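The heart of the deleted leader/mover design is the `head`/`start`/`tail` circular descriptor queue in shared memory: the master bumps `head` to publish a command, the mover advances `start` as it issues requests, and advances `tail` on completion. The sketch below illustrates that protocol reduced to two threads in one process, using `std::atomic` instead of the `volatile` fields plus semaphores above; the `payload` field and loop bounds are illustrative assumptions.

```cpp
// Single-producer/single-consumer ring following the deleted code's
// head (publish) / start (issue) / tail (complete) convention.
#include <atomic>
#include <thread>
#include <cstdio>

constexpr int pool = 48;
struct Descriptor { int command; int payload; };

std::atomic<int> head{0}, start{0}, tail{0};
Descriptor Descrs[pool];

inline int peri_plus(int a) { return (a + 1) % pool; } // PERI_PLUS analogue

void enqueue(int command, int payload) {
  int h    = head.load(std::memory_order_relaxed);
  int next = peri_plus(h);
  while (tail.load(std::memory_order_acquire) == next) { /* FIFO full: spin */ }
  Descrs[h] = {command, payload};
  head.store(next, std::memory_order_release); // publish the descriptor
}

void mover(int ncommands) {
  for (int done = 0; done < ncommands;) {
    int s = start.load(std::memory_order_relaxed);
    if (s == head.load(std::memory_order_acquire)) continue; // queue empty
    std::printf("issuing command %d payload %d\n", Descrs[s].command, Descrs[s].payload);
    start.store(peri_plus(s), std::memory_order_release);
    tail.store(peri_plus(s), std::memory_order_release);    // complete immediately here
    done++;
  }
}

int main() {
  std::thread t(mover, 4);
  for (int i = 0; i < 4; i++) enqueue(/*COMMAND_ISEND*/ 0, i);
  t.join();
  return 0;
}
```

In the real file the producer and consumer are separate MPI ranks sharing the `SlaveState` page, completion is deferred until a `COMMAND_WAITALL` descriptor drains the ring, and the spin-waits are what the "Msync on weak order architectures" comments refer to.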
diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
deleted file mode 100644
index bceea0d8..00000000
--- a/lib/communicator/Communicator_mpit.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/communicator/Communicator_mpit.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/ActionCore.h>
-#include <mpi.h>
-
-namespace Grid {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is set up once and is independent of the Cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-MPI_Comm CartesianCommunicator::communicator_world;
-
-// Should error-check all MPI calls.
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  int flag;
-  int provided;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    if ( provided != MPI_THREAD_MULTIPLE ) {
-      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-    }
-  }
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::~CartesianCommunicator()
-{
-  int MPI_is_finalised;
-  MPI_Finalized(&MPI_is_finalised);
-  if (communicator && !MPI_is_finalised){
-    MPI_Comm_free(&communicator);
-    for(int i=0;i< communicator_halo.size();i++){
-      MPI_Comm_free(&communicator_halo[i]);
-    }
-  }
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank(communicator,&coor[0],&rank);
-  assert(ierr==0);
-  return rank;
-}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords(communicator,rank,_ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-                                           int dest,
-                                           void *recv,
-                                           int from,
-                                           int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-                                           void *recv,
-                                           int sender,
-                                           int receiver,
-                                           int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) {
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes)
-{
-  int myrank = _processor;
-  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    MPI_Request xrq;
-    MPI_Request rrq;
-
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else {
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                      recv,bytes,MPI_CHAR,from,from,
-                      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    int nreq=list.size();
-    std::vector<MPI_Status> status(nreq);
-    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-    assert(ierr==0);
-  }
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
-  assert(ierr==0);
-}
-///////////////////////////////////////////////////////
-// Should only be used prior to Grid Init being finished.
-// Check for this?
-///////////////////////////////////////////////////////
-int CartesianCommunicator::RankWorld(void){
-  int r;
-  MPI_Comm_rank(communicator_world,&r);
-  return r;
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
-  assert(ierr==0);
-}
-
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                         void *xmit,
-                                                         int xmit_to_rank,
-                                                         void *recv,
-                                                         int recv_from_rank,
-                                                         int bytes,int dir)
-{
-  int myrank = _processor;
-  int ierr;
-  int ncomm  =communicator_halo.size();
-  int commdir=dir%ncomm;
-
-  // std::cout << " sending on communicator " << commdir << " " << communicator_halo[commdir] << std::endl;
-
-  MPI_Request xrq;
-  MPI_Request rrq;
-
-  ierr =MPI_Irecv(recv, bytes, MPI_CHAR,recv_from_rank,recv_from_rank,communicator_halo[commdir],&rrq);
-  ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,xmit_to_rank,myrank,communicator_halo[commdir],&xrq);
-
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
-  return 2.0*bytes;
-}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{
-  int nreq=waitall.size();
-  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
-}
-double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
-                                                    int xmit_to_rank,
-                                                    void *recv,
-                                                    int recv_from_rank,
-                                                    int bytes,int dir)
-{
-  int myrank = _processor;
-  int ierr;
-  // std::cout << " sending on communicator " << dir << " of " << communicator_halo.size() << std::endl;
-  int ncomm  =communicator_halo.size();
-  int commdir=dir%ncomm;
-
-  MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,communicator_halo[commdir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,communicator_halo[commdir],&req[0]);
-  MPI_Waitall(2,req,MPI_STATUSES_IGNORE);
-  return 2.0*bytes;
-}
-
-}
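The "mpit" variant's distinguishing feature is the `communicator_halo` array: one duplicated communicator per stencil direction (`commdir = dir % ncomm`), so that concurrent halo exchanges in different directions never contend for the same message-matching queue. A minimal sketch of that setup, with an assumed direction count of eight (four dimensions, forward and backward):

```cpp
// Per-direction halo communicators via MPI_Comm_dup; illustrative only.
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  const int Nd = 8; // assumption: 4 dims x {forward, backward}
  std::vector<MPI_Comm> communicator_halo(Nd);
  for (int d = 0; d < Nd; d++)
    MPI_Comm_dup(MPI_COMM_WORLD, &communicator_halo[d]);

  // A StencilSendToRecvFromBegin-style call would then select
  // communicator_halo[dir % Nd] for the Isend/Irecv pair.

  for (int d = 0; d < Nd; d++) MPI_Comm_free(&communicator_halo[d]);
  MPI_Finalize();
  return 0;
}
```

Because messages on distinct communicators cannot match each other, this also lets every direction reuse the same simple tag convention without cross-talk.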
diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc
deleted file mode 100644
--- a/lib/communicator/Communicator_shmem.cc
+++ /dev/null
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/communicator/Communicator_shmem.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/GridCore.h>
-#include <mpp/shmem.h>
-#include <array>
-
-namespace Grid {
-
-// Should error-check all SHMEM calls.
-#define SHMEM_VET(addr)
-
-#define SHMEM_VET_DEBUG(addr) {                         \
-  if ( ! shmem_addr_accessible(addr,_processor) ) {     \
-    std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
-    BACKTRACEFILE();                                    \
-  }\
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is set up once and is independent of the Cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-typedef struct HandShake_t {
-  uint64_t seq_local;
-  uint64_t seq_remote;
-} HandShake;
-
-std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
-  std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
-  ret.fill(SHMEM_SYNC_VALUE);
-  return ret;
-}
-static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
-
-static Vector< HandShake > XConnections;
-static Vector< HandShake > RConnections;
-
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  shmem_init();
-  XConnections.resize(shmem_n_pes());
-  RConnections.resize(shmem_n_pes());
-  for(int pe =0 ; pe<shmem_n_pes();pe++){
-    XConnections[pe].seq_local = 0;
-    XConnections[pe].seq_remote= 0;
-    RConnections[pe].seq_local = 0;
-    RConnections[pe].seq_remote= 0;
-  }
-  shmem_barrier_all();
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
-  : CartesianCommunicator(processors)
-{
-  std::cout << "Attempts to split SHMEM communicators will fail " << std::endl;
-}
-
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
-{
-  _ndimension = processors.size();
-  std::vector<int> periodic(_ndimension,1);
-
-  _Nprocessors=1;
-  _processors = processors;
-  _processor_coor.resize(_ndimension);
-
-  _processor = shmem_my_pe();
-
-  Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
-
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  int Size = shmem_n_pes();
-
-  assert(Size==_Nprocessors);
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  static long long source ;
-  static long long dest ;
-  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  // int nreduce=1;
-  // int pestart=0;
-  // int logStride=0;
-
-  source = u;
-  dest   = 0;
-  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all(); // necessary?
-  u = dest;
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  static long long source ;
-  static long long dest ;
-  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  // int nreduce=1;
-  // int pestart=0;
-  // int logStride=0;
-
-  source = u;
-  dest   = 0;
-  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all(); // necessary?
-  u = dest;
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  static float source ;
-  static float dest ;
-  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  source = f;
-  dest   = 0.0;
-  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all();
-  f = dest;
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  static float source ;
-  static float dest = 0 ;
-  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  if ( shmem_addr_accessible(f,_processor) ){
-    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    return;
-  }
-
-  // Symmetric heap not guaranteed for this pointer; reduce element by element
-  for(int i=0;i<N;i++){
-    source = f[i];
-    dest   = 0.0;
-    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    f[i] = dest;
-  }
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  static double source;
-  static double dest  ;
-  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  source = d;
-  dest   = 0;
-  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all();
-  d = dest;
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  static double source ;
-  static double dest ;
-  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
-
-  if ( shmem_addr_accessible(d,_processor) ){
-    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    return;
-  }
-
-  // Symmetric heap not guaranteed for this pointer; reduce element by element
-  for(int i=0;i<N;i++){
-    source = d[i];
-    dest   = 0;
-    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    d[i] = dest;
-  }
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  std::vector<int> coor = _processor_coor;
-
-  assert(std::abs(shift) <_processors[dim]);
-
-  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,source,_processors);
-
-  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,dest,_processors);
-
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  Lexicographic::IndexFromCoor(coor,rank,_processors);
-  return rank;
-}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  Lexicographic::CoorFromIndex(coor,rank,_processors);
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-                                           int dest,
-                                           void *recv,
-                                           int from,
-                                           int bytes)
-{
-  SHMEM_VET(xmit);
-  SHMEM_VET(recv);
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-                                           void *recv,
-                                           int sender,
-                                           int receiver,
-                                           int bytes)
-{
-  static uint64_t seq;
-
-  assert(recv!=xmit);
-  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
-  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
-
-  if ( _processor == sender ) {
-
-    // Check the receiver has posted a receive
-    while(SendSeq->seq_remote == SendSeq->seq_local);
-
-    // Advance our send count
-    seq = ++(SendSeq->seq_local);
-
-    // Send this packet
-    SHMEM_VET(recv);
-    shmem_putmem(recv,xmit,bytes,receiver);
-    shmem_fence();
-
-    // Notify the receiver we're done
-    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
-    shmem_fence();
-  }
-  if ( _processor == receiver ) {
-
-    // Post a receive
-    seq = ++(RecvSeq->seq_local);
-    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
-
-    // Now wait until the sender has advanced our reception counter
-    while(RecvSeq->seq_remote != RecvSeq->seq_local);
-
-  }
-}
-
-// Basic halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes)
-{
-  SHMEM_VET(xmit);
-  SHMEM_VET(recv);
-  // shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
-  shmem_putmem(recv,xmit,bytes,dest);
-
-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  // shmem_quiet();      // I'm done
-  if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// the peer is done too
-}
-void CartesianCommunicator::Barrier(void)
-{
-  shmem_barrier_all();
-}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  static std::array<long,_SHMEM_BCAST_SYNC_SIZE> psync = psync_init;
-  static uint32_t word;
-  uint32_t *array = (uint32_t *) data;
-  assert( (bytes % 4)==0);
-  int words = bytes/4;
-
-  if ( shmem_addr_accessible(data,_processor) ){
-    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
-    return;
-  }
-
-  // Not symmetric; bounce word by word through a static symmetric buffer
-  for(int w=0;w<words;w++){
-    word = array[w];
-    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
-    if ( shmem_my_pe() != root ) {
-      array[w] = word;
-    }
-    shmem_barrier_all();
-  }
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  static std::array<long,_SHMEM_BCAST_SYNC_SIZE> psync = psync_init;
-  static uint32_t word;
-  uint32_t *array = (uint32_t *) data;
-  assert( (bytes % 4)==0);
-  int words = bytes/4;
-
-  for(int w=0;w<words;w++){
-    word = array[w];
-    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
-    if ( shmem_my_pe() != root ) {
-      array[w] = word;
-    }
-    shmem_barrier_all();
-  }
-}
-
-}
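The deleted SHMEM backend wraps the OpenSHMEM 1.x collective idiom: symmetric `static` source/destination objects, `pWrk`/`pSync` scratch arrays, and a trailing barrier. A standalone sketch of that idiom follows; it uses the un-prefixed constant names from OpenSHMEM 1.2 (the deleted code used older `_SHMEM_*` spellings), and the whole `*_to_all` family has since been deprecated in favour of `shmem_sum_reduce`.

```cpp
// OpenSHMEM sum-to-all sketch; build with oshc++ (or cc + -lsma) and oshrun.
#include <shmem.h>
#include <cstdio>

int main(void) {
  static long long source, dest;                        // symmetric (static) objects
  static long long pWrk[SHMEM_REDUCE_MIN_WRKDATA_SIZE]; // reduction scratch
  static long pSync[SHMEM_REDUCE_SYNC_SIZE];
  for (int i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE;

  shmem_init();
  source = shmem_my_pe();
  shmem_barrier_all(); // pSync must be initialised on every PE before use
  shmem_longlong_sum_to_all(&dest, &source, 1, /*PE_start*/0, /*logPE_stride*/0,
                            shmem_n_pes(), pWrk, pSync);
  shmem_barrier_all();
  if (shmem_my_pe() == 0) std::printf("sum over PEs = %lld\n", dest);
  shmem_finalize();
  return 0;
}
```

The `shmem_addr_accessible` checks in the deleted code exist because these collectives require symmetric addresses; the word-by-word fallback paths bounce non-symmetric buffers through a `static` (hence symmetric) temporary.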
diff --git a/lib/communicator/SharedMemory.cc b/lib/communicator/SharedMemory.cc
new file mode 100644
--- /dev/null
+++ b/lib/communicator/SharedMemory.cc
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid {
+
+// static data
+
+uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
+int                 GlobalSharedMemory::Hugepages = 0;
+int                 GlobalSharedMemory::ShmSetup;
+
+std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+
+Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
+int                 GlobalSharedMemory::WorldShmRank;
+int                 GlobalSharedMemory::WorldShmSize;
+std::vector<int>    GlobalSharedMemory::WorldShmRanks;
+
+Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
+int                 GlobalSharedMemory::WorldSize;
+int                 GlobalSharedMemory::WorldRank;
+
+int                 GlobalSharedMemory::WorldNodes;
+int                 GlobalSharedMemory::WorldNode;
+
+}
diff --git a/lib/communicator/SharedMemory.h b/lib/communicator/SharedMemory.h
new file mode 100644
index 00000000..2bb112e5
--- /dev/null
+++ b/lib/communicator/SharedMemory.h
@@ -0,0 +1,158 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/communicator/SharedMemory.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+
+// TODO
+// 1) move includes into SharedMemory.cc
+//
+// 2) split shared memory into a) optimal communicator creation from comm world
+//
+//                             b) shared memory buffers container
+//                                -- static globally shared; init once
+//                                -- per instance set of buffers.
+//
+
+#pragma once
+
+#include <Grid/GridCore.h>
+
+#if defined (GRID_COMMS_MPI3)
+#include <mpi.h>
+#endif
+#include <semaphore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <zlib.h>
+#ifdef HAVE_NUMAIF_H
+#include <numaif.h>
+#endif
+
+namespace Grid {
+
+#if defined (GRID_COMMS_MPI3)
+  typedef MPI_Comm    Grid_MPI_Comm;
+  typedef MPI_Request CommsRequest_t;
+#else
+  typedef int CommsRequest_t;
+  typedef int Grid_MPI_Comm;
+#endif
+
+class GlobalSharedMemory {
+ private:
+  // Init-once lock on the buffer allocation
+  static int ShmSetup;
+  static const int MAXLOG2RANKSPERNODE = 16;
+
+ public:
+  static uint64_t MAX_MPI_SHM_BYTES;
+  static int      Hugepages;
+
+  static std::vector<void *> WorldShmCommBufs;
+
+  static Grid_MPI_Comm WorldComm;
+  static int           WorldRank;
+  static int           WorldSize;
+
+  static Grid_MPI_Comm WorldShmComm;
+  static int           WorldShmRank;
+  static int           WorldShmSize;
+
+  static int           WorldNodes;
+  static int           WorldNode;
+
+  static std::vector<int> WorldShmRanks;
+
+  //////////////////////////////////////////////////////////////////////////////////////
+  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
+  //////////////////////////////////////////////////////////////////////////////////////
+  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
+  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
+  ///////////////////////////////////////////////////
+  // Provide shared memory facilities off comm world
+  ///////////////////////////////////////////////////
+  static void SharedMemoryAllocate(uint64_t bytes, int flags);
+  static void SharedMemoryFree(void);
+
+};
+
+//////////////////////////////
+// one per communicator
+//////////////////////////////
+class SharedMemory
+{
+ private:
+  static const int MAXLOG2RANKSPERNODE = 16;
+
+  size_t heap_top;
+  size_t heap_bytes;
+  size_t heap_size;
+
+ protected:
+
+  Grid_MPI_Comm    ShmComm; // for barriers
+  int              ShmRank;
+  int              ShmSize;
+  std::vector<void *> ShmCommBufs;
+  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
+
+ public:
+  SharedMemory() {};
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // set the buffers & sizes
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void SetCommunicator(Grid_MPI_Comm comm);
+
+  ////////////////////////////////////////////////////////////////////////
+  // For this instance ; disjoint buffer sets between splits if split grid
+  ////////////////////////////////////////////////////////////////////////
+  void ShmBarrier(void);
+
+  ///////////////////////////////////////////////////
+  // Call on any instance
+  ///////////////////////////////////////////////////
+  void SharedMemoryTest(void);
+  void *ShmBufferSelf(void);
+  void *ShmBuffer         (int rank);
+  void *ShmBufferTranslate(int rank,void * local_p);
+  void *ShmBufferMalloc(size_t bytes);
+  void  ShmBufferFreeAll(void) ;
+
+  //////////////////////////////////////////////////////////////////////////
+  // Make info on Nodes & ranks and Shared memory available
+  //////////////////////////////////////////////////////////////////////////
+  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
+  int RankCount(void) { return GlobalSharedMemory::WorldSize;};
+
+};
+
+}
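The node discovery that `GlobalSharedMemory::Init` performs rests on `MPI_Comm_split_type` with `MPI_COMM_TYPE_SHARED`, which groups exactly the ranks that can map each other's memory. A self-contained sketch of that discovery step (illustrative variable names mirror the header; not the library implementation itself):

```cpp
// Discover node-local layout by splitting MPI_COMM_WORLD on shared-memory locality.
#include <mpi.h>
#include <cassert>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int WorldRank, WorldSize;
  MPI_Comm_rank(MPI_COMM_WORLD, &WorldRank);
  MPI_Comm_size(MPI_COMM_WORLD, &WorldSize);

  MPI_Comm WorldShmComm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &WorldShmComm);

  int WorldShmRank, WorldShmSize;
  MPI_Comm_rank(WorldShmComm, &WorldShmRank);
  MPI_Comm_size(WorldShmComm, &WorldShmSize);

  // Assumes a uniform ranks-per-node count, exactly as the FIXME in Init notes.
  int WorldNodes = WorldSize / WorldShmSize;
  assert(WorldNodes * WorldShmSize == WorldSize);
  if (WorldRank == 0) std::printf("%d nodes x %d ranks/node\n", WorldNodes, WorldShmSize);

  MPI_Comm_free(&WorldShmComm);
  MPI_Finalize();
  return 0;
}
```

`MPI_Group_translate_ranks`, used next in `Init`, then converts between world ranks and node-local ranks, returning `MPI_UNDEFINED` for ranks outside the node; that is what populates `WorldShmRanks`.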
diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc
new file mode 100644
index 00000000..af4f9702
--- /dev/null
+++ b/lib/communicator/SharedMemoryMPI.cc
@@ -0,0 +1,415 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/communicator/SharedMemoryMPI.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid {
+
+/*Construct from an MPI communicator*/
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  WorldComm = comm;
+  MPI_Comm_rank(WorldComm,&WorldRank);
+  MPI_Comm_size(WorldComm,&WorldSize);
+  // WorldComm, WorldSize, WorldRank
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+  MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm ,&WorldShmSize);
+  // WorldShmComm, WorldShmSize, WorldShmRank
+
+  // WorldNodes
+  WorldNodes = WorldSize/WorldShmSize;
+  assert( (WorldNodes * WorldShmSize) == WorldSize );
+
+  // FIXME: Check all WorldShmSize are the same ?
+
+  /////////////////////////////////////////////////////////////////////
+  // find world ranks in our SHM group (i.e. which ranks are on our node)
+  /////////////////////////////////////////////////////////////////////
+  MPI_Group WorldGroup, ShmGroup;
+  MPI_Comm_group (WorldComm,    &WorldGroup);
+  MPI_Comm_group (WorldShmComm, &ShmGroup);
+
+  std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
+
+  WorldShmRanks.resize(WorldSize);
+  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup,&WorldShmRanks[0]);
+
+  ///////////////////////////////////////////////////////////////////
+  // Identify who is in my group and nominate the leader
+  ///////////////////////////////////////////////////////////////////
+  int g=0;
+  std::vector<int> MyGroup;
+  MyGroup.resize(WorldShmSize);
+  for(int rank=0;rank<WorldSize;rank++){
+    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
+      assert(g<WorldShmSize);
+      MyGroup[g++] = rank;
+    }
+  }
+
+  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
+  int myleader = MyGroup[0];
+
+  std::vector<int> leaders_1hot(WorldSize,0);
+  std::vector<int> leaders_group(WorldNodes,0);
+  leaders_1hot [ myleader ] = 1;
+
+  ///////////////////////////////////////////////////////////////////
+  // global sum leaders over comm world
+  ///////////////////////////////////////////////////////////////////
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
+  assert(ierr==0);
+
+  ///////////////////////////////////////////////////////////////////
+  // find the group leaders world rank
+  ///////////////////////////////////////////////////////////////////
+  int group=0;
+  for(int l=0;l<WorldSize;l++){
+    if(leaders_1hot[l]){
+      leaders_group[group++] = l;
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////
+  // Identify the node of our leader; that is our node id
+  ///////////////////////////////////////////////////////////////////
+  WorldNode=-1;
+  for(int n=0;n<WorldNodes;n++){
+    if ( leaders_group[n] == myleader ) WorldNode=n;
+  }
+  assert(WorldNode!=-1);
+
+  ShmSetup=1;
+}
+
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = -1;
+  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
+    if ( (0x1<<i) == WorldShmSize ) {
+      log2size = i;
+      break;
+    }
+  }
+  assert(log2size != -1);
+
+  ////////////////////////////////////////////////////////////////
+  // Identify subblock of ranks on node spreading across dims
+  // in a maximally symmetrical way
+  ////////////////////////////////////////////////////////////////
+  int ndimension = processors.size();
+  std::vector<int> processor_coor(ndimension);
+  std::vector<int> WorldDims = processors; std::vector<int> ShmDims(ndimension,1); std::vector<int> NodeDims(ndimension);
+  std::vector<int> ShmCoor  (ndimension);  std::vector<int> NodeCoor(ndimension);  std::vector<int> WorldCoor(ndimension);
+  int dim = 0;
+  for(int l2=0;l2<log2size;l2++){
+    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%ndimension;
+    ShmDims[dim]*=2;
+    dim=(dim+1)%ndimension;
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Establish torus of processes and nodes with sub-blockings
+  ////////////////////////////////////////////////////////////////
+  for(int d=0;d<ndimension;d++){
+    NodeDims[d] = WorldDims[d]/ShmDims[d];
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Map the node coordinate and on-node coordinate to the world rank
+  // that MPI_Cart_create should assign to this process
+  ////////////////////////////////////////////////////////////////
+  int rank;
+  Lexicographic::CoorFromIndex(NodeCoor,WorldNode   ,NodeDims);
+  Lexicographic::CoorFromIndex(ShmCoor ,WorldShmRank,ShmDims);
+  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
+  Lexicographic::IndexFromCoor(WorldCoor,rank,WorldDims);
+
+  /////////////////////////////////////////////////////////////////
+  // Build the new communicator, keyed by the reordered rank
+  /////////////////////////////////////////////////////////////////
+  int ierr = MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
+  assert(ierr==0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended; one shmem segment per rank on node, mapped by all
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  MAX_MPI_SHM_BYTES=bytes;
+  assert(ShmSetup==1);
+  WorldShmCommBufs.resize(WorldShmSize);
+
+  char shm_name [NAME_MAX];
+  MPI_Barrier(WorldShmComm);
+
+  for(int r=0;r<WorldShmSize;r++){
+
+    sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+
+    if ( r==WorldShmRank ) {
+      // create and size our own segment
+      shm_unlink(shm_name);
+      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
+      if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
+      ftruncate(fd, bytes);
+
+      int mmap_flag = MAP_SHARED;
+#ifdef MAP_HUGETLB
+      if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+      void *ptr = mmap(NULL,bytes, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
+      if ( ptr == (void *)MAP_FAILED ) { perror("failed mmap"); assert(0); }
+      WorldShmCommBufs[r] = ptr;
+      close(fd);
+    }
+
+    MPI_Barrier(WorldShmComm);
+
+    if ( r!=WorldShmRank ) {
+      // map the segment our neighbour created
+      int fd=shm_open(shm_name,O_RDWR,0666);
+      if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
+      void *ptr = mmap(NULL,bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if ( ptr == (void *)MAP_FAILED ) { perror("failed mmap"); assert(0); }
+      WorldShmCommBufs[r] = ptr;
+      close(fd);
+    }
+  }
+  MPI_Barrier(WorldShmComm);
+}
+
+void GlobalSharedMemory::SharedMemoryFree(void)
+{
+  assert(ShmSetup);
+  assert(0); // unimplemented
+}
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  int rank, size;
+  MPI_Comm_rank(comm,&rank);
+  MPI_Comm_size(comm,&size);
+  ShmRanks.resize(size);
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
+  MPI_Comm_rank(ShmComm ,&ShmRank);
+  MPI_Comm_size(ShmComm ,&ShmSize);
+  ShmCommBufs.resize(ShmSize);
+
+  //////////////////////////////////////////////////////////////////////
+  // Map ShmRank to WorldShmRank and use the right buffer
+  //////////////////////////////////////////////////////////////////////
+  for(int r=0;r<ShmSize;r++){
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[r];
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Map comm ranks to ranks within the SHM group; off-node peers map
+  // to MPI_UNDEFINED
+  //////////////////////////////////////////////////////////////////////
+  MPI_Group FullGroup, ShmGroup;
+  MPI_Comm_group (comm   , &FullGroup);
+  MPI_Comm_group (ShmComm, &ShmGroup);
+  std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
+  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup,&ShmRanks[0]);
+
+  heap_size = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
+  ShmBufferFreeAll();
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void)
+{
+  MPI_Barrier(ShmComm);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void)
+{
+  ShmBarrier();
+  if ( ShmRank == 0 ) {
+    for(int r=0;r<ShmSize;r++){
+      uint64_t * check = (uint64_t *) ShmCommBufs[r];
+      check[0] = GlobalSharedMemory::WorldNode;
+      check[1] = r;
+    }
+  }
+  ShmBarrier();
+  for(int r=0;r<ShmSize;r++){
+    uint64_t * check = (uint64_t *) ShmCommBufs[r];
+    assert(check[0]==GlobalSharedMemory::WorldNode);
+    assert(check[1]==r);
+  }
+  ShmBarrier();
+}
+
+void *SharedMemory::ShmBufferSelf(void)
+{
+  return ShmCommBufs[ShmRank];
+}
+void *SharedMemory::ShmBuffer(int rank)
+{
+  int gpeer = ShmRanks[rank];
+  if (gpeer == MPI_UNDEFINED){
+    return NULL;
+  } else {
+    return ShmCommBufs[gpeer];
+  }
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  int gpeer = ShmRanks[rank];
+  if (gpeer == MPI_UNDEFINED){
+    return NULL;
+  } else {
+    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
+    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    return (void *) remote;
+  }
+}
+
+/////////////////////////////////
+// Alloc, free shmem region ; common to MPI and none?
+/////////////////////////////////
+void *SharedMemory::ShmBufferMalloc(size_t bytes){
+  void *ptr = (void *)heap_top;
+  heap_top  += bytes;
+  heap_bytes+= bytes;
+  if (heap_bytes >= heap_size) {
+    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm flag" <<std::endl;
+    assert(heap_bytes<heap_size);
+  }
+  return ptr;
+}
+void SharedMemory::ShmBufferFreeAll(void) {
+  heap_top  =(size_t)ShmBufferSelf();
+  heap_bytes=0;
+}
+
+}
diff --git a/lib/communicator/SharedMemoryNone.cc b/lib/communicator/SharedMemoryNone.cc
new file mode 100644
--- /dev/null
+++ b/lib/communicator/SharedMemoryNone.cc
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/communicator/SharedMemoryNone.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid {
+
+/*Construct from an MPI communicator*/
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  WorldComm = 0;
+  WorldRank = 0;
+  WorldSize = 1;
+  WorldShmComm = 0 ;
+  WorldShmRank = 0 ;
+  WorldShmSize = 1 ;
+  WorldNodes   = 1 ;
+  WorldNode    = 0 ;
+  WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
+  WorldShmCommBufs.resize(1);
+}
+
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  optimal_comm = WorldComm;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended, use anonymous mmap
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  void * ShmCommBuf ;
+  MAX_MPI_SHM_BYTES=bytes;
+  int mmap_flag =0;
+#ifdef MAP_ANONYMOUS
+  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
+#endif
+#ifdef MAP_ANON
+  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
+#endif
+#ifdef MAP_HUGETLB
+  if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+  ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
+  if (ShmCommBuf == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);
+  }
+#ifdef MADV_HUGEPAGE
+  if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
+#endif
+  bzero(ShmCommBuf,bytes);
+  WorldShmCommBufs[0] = ShmCommBuf;
+};
+
+void GlobalSharedMemory::SharedMemoryFree(void)
+{
+  assert(ShmSetup);
+  assert(0); // unimplemented
+}
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  ShmRanks.resize(1);
+  ShmCommBufs.resize(1);
+  ShmRanks[0] = 0;
+  ShmRank     = 0;
+  ShmSize     = 1;
+  //////////////////////////////////////////////////////////////////////
+  // Map ShmRank to WorldShmRank and use the right buffer
+  //////////////////////////////////////////////////////////////////////
+  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
+  heap_size = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
+  ShmBufferFreeAll();
+  return;
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void){ return ; }
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void) { return; }
+
+void *SharedMemory::ShmBufferSelf(void)
+{
+  return ShmCommBufs[ShmRank];
+}
+void *SharedMemory::ShmBuffer(int rank)
+{
+  return NULL;
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  return NULL;
+}
+
+/////////////////////////////////
+// Alloc, free shmem region ; common to MPI and none?
+/////////////////////////////////
+void *SharedMemory::ShmBufferMalloc(size_t bytes){
+  void *ptr = (void *)heap_top;
+  heap_top  += bytes;
+  heap_bytes+= bytes;
+  if (heap_bytes >= heap_size) {
+    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm flag" <<std::endl;
+    assert(heap_bytes<heap_size);
+  }
+  return ptr;
+}
+void SharedMemory::ShmBufferFreeAll(void) {
+  heap_top  =(size_t)ShmBufferSelf();
+  heap_bytes=0;
+}
+
+}
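`ShmBufferMalloc`/`ShmBufferFreeAll` together form a classic bump (arena) allocator: allocations advance a pointer through the fixed shared region and are only ever freed en masse. The sketch below restates the idea in ordinary heap memory; the alignment rounding is an illustrative assumption, not something the code above does.

```cpp
// Minimal bump-allocator sketch mirroring ShmBufferMalloc / ShmBufferFreeAll.
#include <cstdint>
#include <cstdlib>
#include <cassert>

struct BumpHeap {
  uint8_t *base; size_t size; size_t top = 0;

  void *Malloc(size_t bytes) {
    bytes = (bytes + 127) & ~size_t(127);            // cache-line-ish rounding (assumption)
    assert(top + bytes <= size && "heap exhausted"); // analogue of the --shm overflow check
    void *p = base + top;
    top += bytes;
    return p;
  }
  void FreeAll() { top = 0; }                        // no per-allocation free
};

int main() {
  BumpHeap h{static_cast<uint8_t*>(std::malloc(1 << 20)), 1 << 20};
  void *a = h.Malloc(1000);
  void *b = h.Malloc(1000);
  assert(a != b);
  h.FreeAll(); // both buffers recycled in one shot, like ShmBufferFreeAll
  std::free(h.base);
  return 0;
}
```

This is why the refactor can keep "per instance set of buffers" disjoint between split grids, as the SharedMemory.h TODO describes: each communicator instance just carves its transient halo buffers out of its own region and resets the watermark, with no free-list bookkeeping shared across instances.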