/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

    Author: Peter Boyle

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/*  END LEGAL */

// NOTE(review): this file has been damaged in transit — every "<...>" span was
// stripped (apparently treated as markup).  That removed all #include targets
// below, every std::vector<...> element type, all "<<" stream tails, and the
// interior of many "for(int r=0;r<N;r++)" loops; several multi-statement
// regions (the shared-memory communicator split in Init(), the shared-memory
// segment allocation, the ShiftedRanks() and constructor signatures) are
// missing entirely.  The file cannot compile as-is; restore the missing text
// from the upstream Grid repository.  The comments added below document only
// what the surviving tokens demonstrably do, with hedged notes at each gap.

// Targets of all ten #include directives were stripped.  MPI is clearly
// required (mpi.h), and the SHM_HUGETLB fallback below suggests the System V
// shared-memory headers (sys/ipc.h, sys/shm.h) — TODO confirm against upstream.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
//#include

// Fallback for platforms whose sys/shm.h does not define SHM_HUGETLB
// (huge-page-backed System V shared memory segments).
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

namespace Grid {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
// (static members shared by all CartesianCommunicator instances in the process)
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;  // presumably set non-zero once Init() has run — TODO confirm (the assigning code is in a lost span)
int CartesianCommunicator::ShmRank;       // rank within the node-local (shared-memory) communicator
int CartesianCommunicator::ShmSize;       // size of the node-local communicator
int CartesianCommunicator::GroupRank;     // which node (shared-memory group) this rank belongs to — set in a lost span of Init()
int CartesianCommunicator::GroupSize;     // number of nodes; see NodeCount()
int CartesianCommunicator::WorldRank;     // rank in communicator_world
int CartesianCommunicator::WorldSize;     // size of communicator_world
MPI_Comm CartesianCommunicator::communicator_world; // dup of MPI_COMM_WORLD made in Init()
MPI_Comm CartesianCommunicator::ShmComm;            // node-local communicator (used by StencilBarrier)
MPI_Win CartesianCommunicator::ShmWindow;           // not referenced in the surviving text — presumably used in a lost span
// NOTE(review): element types of the three vectors below were stripped.
// Surviving usage implies GroupRanks/MyGroup hold int (world/shm rank maps)
// and ShmCommBufs holds void* (per-shm-rank buffer base addresses).
std::vector CartesianCommunicator::GroupRanks;   // world rank -> shm-group rank, MPI_UNDEFINED if off-node
std::vector CartesianCommunicator::MyGroup;      // world ranks that share this node
std::vector CartesianCommunicator::ShmCommBufs;  // shared-memory comms buffer base per shm rank

// Number of nodes (shared-memory groups) participating in the job.
int CartesianCommunicator::NodeCount(void) { return GroupSize;};

// When defined, FORCE_COMMS makes the Shm* helpers pretend every peer is
// off-node, forcing all traffic through MPI (debugging aid).  It is
// explicitly undefined here.
#undef FORCE_COMMS

// Base address of this rank's own shared-memory communications buffer.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return ShmCommBufs[ShmRank];
}

// Return the shared-memory buffer base of world rank `rank`, or NULL if that
// rank is not on this node (GroupRanks[rank] == MPI_UNDEFINED).
void *CartesianCommunicator::ShmBuffer(int rank)
{
  int gpeer = GroupRanks[rank];
#ifdef FORCE_COMMS
  return NULL; // debugging: treat everyone as off-node
#endif
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else {
    return ShmCommBufs[gpeer];
  }
}

// Translate a pointer `local_p` inside this rank's shm buffer into the
// corresponding address inside world-rank `rank`'s shm buffer (same offset,
// different segment base).  Returns NULL if `rank` is off-node.
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
  static int count =0; // NOTE(review): unused in the surviving text
  int gpeer = GroupRanks[rank];
  assert(gpeer!=ShmRank); // never send to self
  assert(rank!=WorldRank);// never send to self
#ifdef FORCE_COMMS
  return NULL; // debugging: treat everyone as off-node
#endif
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else {
    // rebase: offset of local_p within our segment, applied to peer's segment
    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
    return (void *) remote;
  }
}

// One-time MPI initialisation: bring up MPI (thread-multiple), dup
// MPI_COMM_WORLD, and (in text partially lost below) split out the node-local
// shared-memory communicator, identify node groups/leaders, and allocate the
// shared-memory communication buffers.
void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
  // mtrace();

  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    assert (provided == MPI_THREAD_MULTIPLE);
  }

  Grid_quiesce_nodes();

  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&WorldRank);
  MPI_Comm_size(communicator_world,&WorldSize);

  // NOTE(review): the next three statement-runs are corrupted — the stripped
  // spans presumably contained the tail of the cout line, the
  // MPI_Comm_split_type(...,MPI_COMM_TYPE_SHARED,...) split that fills
  // ShmComm/ShmRank/ShmSize, the MPI_Group_translate_ranks call filling
  // GroupRanks, the loop collecting MyGroup, and a std::sort of MyGroup.
  // TODO restore from upstream.
  if ( WorldRank == 0 ) {
    std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"< world_ranks(WorldSize);
  GroupRanks.resize(WorldSize);
  for(int r=0;r());
  int myleader = MyGroup[0]; // smallest/first world rank on this node acts as leader

  // one-hot vector marking this node's leader in world-rank space
  std::vector leaders_1hot(WorldSize,0);
  std::vector leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;

  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
  assert(ierr==0);

  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  // NOTE(review): large lost span follows.  It presumably (a) filled
  // leaders_group[] from leaders_1hot[] and derived GroupRank, and (b)
  // allocated one shared-memory segment per shm rank — the SHM_HUGETLB
  // define above and the shmids vector suggest shmget()/shmat() with huge
  // pages, broadcast of the ids over ShmComm, and IPC_RMID cleanup marking.
  // The span also swallowed the end of Init() and the signature of
  // ShiftedRanks(int dim,int shift,int &source,int &dest); everything from
  // "coor = _processor_coor" below is the body of ShiftedRanks.
  // TODO restore from upstream.
  for(int l=0;l shmids(ShmSize);
  if ( ShmRank == 0 ) {
    for(int r=0;r coor = _processor_coor; // my coord
  assert(std::abs(shift) <_processors[dim]);

  // neighbour in +shift direction along `dim` (periodic wrap)
  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,source,_processors);
  source = LexicographicToWorldRank[source];

  // neighbour in -shift direction along `dim` (periodic wrap)
  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,dest,_processors);
  dest = LexicographicToWorldRank[dest];

}// rank is world rank.

// Map a cartesian processor coordinate to a world rank, via its
// lexicographic index and the LexicographicToWorldRank table.
// NOTE(review): the vector element type was stripped — presumably
// std::vector<int> &coor.
int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor)
{
  int rank;
  Lexicographic::IndexFromCoor(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
  return rank;
}// rank is world rank

// Inverse map: recover the cartesian processor coordinate of a world rank.
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor)
{
  int lr=-1;
  // NOTE(review): the search-loop body was lost (presumably it scans
  // LexicographicToWorldRank for `rank`, asserts it was found, then calls
  // Lexicographic::CoorFromIndex).  The lost span also swallowed the
  // constructor signature — everything from "int ierr;" below is the body of
  // CartesianCommunicator::CartesianCommunicator(const std::vector<int>&).
  // TODO restore from upstream.
  for(int r=0;r &processors)
{
  int ierr;
  communicator=communicator_world;

  _ndimension = processors.size();

  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  // NOTE(review): loop body lost — presumably finds log2size such that
  // (1<<log2size)==ShmSize and asserts it; the span also removed the code
  // between here and the WorldDims copy.  TODO restore from upstream.
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1< WorldDims = processors;

  ShmDims.resize (_ndimension,1);
  GroupDims.resize(_ndimension);
  ShmCoor.resize (_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);

  // NOTE(review): lost span — presumably distributes the log2size powers of
  // two over ShmDims round-robin, derives GroupDims = WorldDims/ShmDims,
  // decomposes this rank into Shm/Group/World coordinates, and builds the
  // LexicographicToWorldRank table, ending in a consistency check loop over
  // all world ranks (whose interior survives below).  TODO restore.
  int dim = 0;
  for(int l2=0;l2 coor(_ndimension);
  ProcessorCoorFromRank(wr,coor); // from world rank
  int ck = RankFromProcessorCoor(coor);
  assert(ck==wr);

  if ( wr == WorldRank ) {
    for(int j=0;j mcoor = coor;
  // verify every rank agrees with rank 0's coordinate assignment
  this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
  for(int d = 0 ; d< _ndimension; d++) {
    assert(coor[d] == mcoor[d]);
  }
  }
};

// Global reductions: in-place MPI_Allreduce sums over the cartesian communicator.
void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}

// Basic Halo comms primitive
// Blocking exchange: post the send/recv pair then wait for completion.
// NOTE(review): reqs element type stripped — presumably std::vector<MPI_Request>.
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  std::vector reqs(0);
  // unsigned long xcrc = crc32(0L, Z_NULL, 0);
  // unsigned long rcrc = crc32(0L, Z_NULL, 0);
  // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
  // rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  // printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}

// Point-to-point blocking packet between two named ranks; only the sender
// and receiver take any action, other ranks fall straight through.
void CartesianCommunicator::SendRecvPacket(void *xmit,
                                           void *recv,
                                           int sender,
                                           int receiver,
                                           int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender; // tag by sending rank so concurrent pairs don't collide
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) {
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
// Concurrent policy: post non-blocking Isend/Irecv and append the requests to
// `list` for a later SendToRecvFromComplete.  Otherwise: blocking MPI_Sendrecv
// and `list` is left empty.
// NOTE(review): list element type stripped — presumably std::vector<MPI_Request>.
void CartesianCommunicator::SendToRecvFromBegin(std::vector &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
    MPI_Request xrq;
    MPI_Request rrq;

    // messages are tagged by the sending rank
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);

    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else {
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
                      recv,bytes,MPI_CHAR,from, from,
                      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
}

// Stencil halo exchange: only posts MPI traffic for OFF-node peers
// (GroupRanks[..]==MPI_UNDEFINED); on-node peers are expected to use the
// shared-memory buffers directly.  Returns the number of bytes that actually
// went over the network.
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,
                                                         void *xmit,
                                                         int dest,
                                                         void *recv,
                                                         int from,
                                                         int bytes)
{
  MPI_Request xrq;
  MPI_Request rrq;

  int ierr;
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme = GroupRanks[_processor];

  assert(dest != _processor);
  assert(from != _processor);
  assert(gme == ShmRank);
  double off_node_bytes=0.0;

#ifdef FORCE_COMMS
  // debugging: force the MPI path even for on-node peers
  gdest = MPI_UNDEFINED;
  gfrom = MPI_UNDEFINED;
#endif

  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }

  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
    // sequential policy completes immediately rather than overlapping
    this->StencilSendToRecvFromComplete(list);
  }

  return off_node_bytes;
}

// Complete an outstanding stencil exchange (same as the generic completion).
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall)
{
  SendToRecvFromComplete(waitall);
}

// Barrier across the ranks of this node only (shared-memory communicator).
void CartesianCommunicator::StencilBarrier(void)
{
  MPI_Barrier (ShmComm);
}

// Wait on every request in `list`, then empty it for reuse.
// NOTE(review): element types stripped — presumably std::vector<MPI_Request>
// and std::vector<MPI_Status>.
void CartesianCommunicator::SendToRecvFromComplete(std::vector &list)
{
  int nreq=list.size();

  if (nreq==0) return; // nothing outstanding (e.g. Sendrecv policy)

  std::vector status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
}

// Barrier across the full cartesian communicator.
void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

// Broadcast `bytes` bytes from `root` over the cartesian communicator.
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
                     bytes,
                     MPI_BYTE,
                     root,
                     communicator);
  assert(ierr==0);
}

// This rank's position in the duplicated world communicator.
int CartesianCommunicator::RankWorld(void){
  int r;
  MPI_Comm_rank(communicator_world,&r);
  return r;
}

// Broadcast `bytes` bytes from `root` over ALL ranks of the job.
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
                      bytes,
                      MPI_BYTE,
                      root,
                      communicator_world);
  assert(ierr==0);
}

}