/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>

NAMESPACE_BEGIN(Grid);

Grid_MPI_Comm CartesianCommunicator::communicator_world;

////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
  int flag;
  int provided;

  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {

#ifndef GRID_COMMS_THREADS
    nCommThreads=1;
    // For now: comms overlap leads to wrong results in Benchmark_wilson even on single-node MPI runs;
    // the other comms schemes are OK, so fall back to serialised threading.
    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
#else
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#endif
    // If only one comms thread we require any threading mode other than SINGLE,
    // but for multiple comms threads we need MULTIPLE.
    if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
      assert(0);
    }

    if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
      assert(0);
    }
  }

  // Never cleaned up; this is done exactly once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

  Grid_quiesce_nodes();
  GlobalSharedMemory::Init(communicator_world);
  GlobalSharedMemory::SharedMemoryAllocate(
		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
		   GlobalSharedMemory::Hugepages);
  Grid_unquiesce_nodes();
}
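
/* A minimal usage sketch (illustrative only; assumes the standard Grid entry
 * points, which invoke CartesianCommunicator::Init during start-up):
 *
 *   int main(int argc, char **argv) {
 *     Grid::Grid_init(&argc, &argv);   // comms system initialised here
 *     // ... construct grids, run ...
 *     Grid::Grid_finalize();
 *   }
 */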

///////////////////////////////////////////////////////////////////////////
// Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{
  int rank;
  int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}
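
/* Illustrative round trip (hypothetical 2x2 processor layout):
 *
 *   Coordinate coor({1,0});
 *   int r = RankFromProcessorCoor(coor);  // rank assigned by MPI_Cart_create
 *   ProcessorCoorFromRank(r, coor);       // recovers {1,0}
 */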

////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
  MPI_Comm optimal_comm;
  ////////////////////////////////////////////////////
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
  // Free the temp communicator
  ///////////////////////////////////////////////////
  MPI_Comm_free(&optimal_comm);
}

//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
  _ndimension = processors.size();  assert(_ndimension>=1);
  int parent_ndimension = parent._ndimension;  assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
  Coordinate shm_processors       (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
    shm_processors       [pad+d]=parent._shm_processors[d];
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);

  int childsize=1;
  for(int d=0;d<processors.size();d++) {
    childsize *= processors[d];
  }
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);

  Coordinate ccoor(_ndimension); // coor within subcommunicator
  Coordinate scoor(_ndimension); // coor of the split within parent
  Coordinate ssize(_ndimension); // number of splits in each dimension

  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }

  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  int crank;
  // MPI uses the reverse lexicographic convention to ours, so the reversed routines are called.
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
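  // Worked example (illustrative): with split dimensions {2,3}, the coordinate
  // {1,2} maps to index 1*3 + 2 = 5, i.e. the last dimension runs fastest,
  // matching the rank ordering produced by MPI_Cart_create.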

  MPI_Comm comm_split;
  if ( Nchild > 1 ) {

    ////////////////////////////////////////////////////////////////
    // Split the communicator
    ////////////////////////////////////////////////////////////////
    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);

  } else {
    srank = 0;
    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);

  ///////////////////////////////////////////////
  // Free the temp communicator
  ///////////////////////////////////////////////
  MPI_Comm_free(&comm_split);

  if(0){
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
    }
  }
  for(int d=0;d<processors.size();d++){
    assert(_processor_coor[d] == ccoor[d] );
  }
}

void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{
  ////////////////////////////////////////////////////
  // Creates communicator, and the communicator_halo
  ////////////////////////////////////////////////////
  _ndimension = processors.size();
  _processor_coor.resize(_ndimension);

  /////////////////////////////////
  // Count the requested nodes
  /////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  Coordinate periodic(_ndimension,1);
  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  if ( 0 && (communicator_base != communicator_world) ) {
    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
    for(int d=0;d<_processors.size();d++){
      std::cout << _processor_coor[d]<<" ";
    }
    std::cout << std::endl;
  }

  int Size;
  MPI_Comm_size(communicator,&Size);

  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  assert(Size==_Nprocessors);
}

CartesianCommunicator::~CartesianCommunicator()
{
  int MPI_is_finalised;
  MPI_Finalized(&MPI_is_finalised);
  if (communicator && !MPI_is_finalised) {
    MPI_Comm_free(&communicator);
    for(int i=0;i<communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
  }
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
  CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}

void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
						void *xmit,
						int dest,
						void *recv,
						int from,
						int bytes,int dir)
{
  MPI_Request xrq;
  MPI_Request rrq;

  assert(dest != _processor);
  assert(from != _processor);

  int tag;
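  // The tag encodes (sender rank, direction) as tag = dir + rank*32; this
  // appears to assume dir < 32, so that tags are unique per sender and dir.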

  tag= dir+from*32;
  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
  assert(ierr==0);
  list.push_back(rrq);

  tag= dir+_processor*32;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
  assert(ierr==0);
  list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
  int nreq=list.size();

  if (nreq==0) return;

  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
					   int dest,
					   void *recv,
					   int from,
					   int bytes)
{
  std::vector<MpiCommsRequest_t> reqs(0);

  int myrank = _processor;
  int ierr;

  // Enforce no UVM in comms, device or host OK
  assert(acceleratorIsCommunicable(xmit));
  assert(acceleratorIsCommunicable(recv));

  // Give the CPU to MPI immediately; can use threads to overlap optionally
  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
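  // Tags pair up as (send tag = myrank) <-> (recv tag = from): each matched
  // Sendrecv uses the sender's rank as the message tag.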
  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
		    recv,bytes,MPI_CHAR,from, from,
		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);
}

// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
						     int dest, int dox,
						     void *recv,
						     int from, int dor,
						     int bytes,int dir)
{
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
}
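
///////////////////////////////////////////////////////////////////////////////
// Two build modes follow: with ACCELERATOR_AWARE_MPI, device pointers are
// handed straight to MPI; without it, traffic is staged through host buffers
// in the pipeline mode implemented further below.
///////////////////////////////////////////////////////////////////////////////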

#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
							    void *xmit,
							    int dest,int dox,
							    void *recv,
							    int from,int dor,
							    int xbytes,int rbytes,int dir)
{
  return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							  void *xmit,
							  int dest,int dox,
							  void *recv,
							  int from,int dor,
							  int xbytes,int rbytes,int dir)
{
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

  MPI_Request xrq;
  MPI_Request rrq;

  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];

  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;

  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
#ifdef NVLINK_GET
    else {
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
    }
#endif
  }
  // This is a NVLINK PUT
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
    }
  }
  return off_node_bytes;
}

void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
  int nreq=list.size();
  /* finishes Get/Put */
  acceleratorCopySynchronise();

  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
  this->StencilBarrier();
}

#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
 * In prepare (phase 1):
 * PHASE 1: (prepare)
 * - post MPI receive buffers asynch
 * - post device - host send buffer transfer asynch
 * PHASE 2: (Begin)
 * - complete all copies
 * - post MPI send asynch
 * - post device - device transfers
 * PHASE 3: (Complete)
 * - MPI_waitall
 * - host-device transfers
 *
 *********************************
 * NB could split this further:
 *--------------------------------
 * PHASE 1: (Prepare)
 * - post MPI receive buffers asynch
 * - post device - host send buffer transfer asynch
 * PHASE 2: (BeginInterNode)
 * - complete all copies
 * - post MPI send asynch
 * PHASE 3: (BeginIntraNode)
 * - post device - device transfers
 * PHASE 4: (Complete)
 * - MPI_waitall
 * - host-device transfers asynch
 * - (complete all copies)
 */
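/* A plausible driver sequence for this pipeline mode (illustrative only;
 * the real orchestration lives in the Stencil code):
 *
 *   std::vector<CommsRequest_t> list;
 *   StencilSendToRecvFromPrepare (list,xmit,dest,dox,recv,from,dor,xbytes,rbytes,dir); // Irecv + DtoH copies
 *   StencilSendToRecvFromPollDtoH(list);      // as each DtoH lands, MPI_Isend it
 *   StencilSendToRecvFromBegin   (list,xmit,dest,dox,recv,from,dor,xbytes,rbytes,dir); // intranode puts/gets
 *   StencilSendToRecvFromPollIRecv(list);     // as each Irecv lands, start the HtoD copy
 *   StencilSendToRecvFromComplete(list,dir);  // waitall on sends + synchronise copies
 */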
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
							    void *xmit,
							    int dest,int dox,
							    void *recv,
							    int from,int dor,
							    int xbytes,int rbytes,int dir)
{
  /*
   * Bring sequence from Stencil.h down to lower level.
   * Assume using XeLink is ok
   */
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

  MPI_Request xrq;
  MPI_Request rrq;

  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];

  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;

  void * host_recv = NULL;
  void * host_xmit = NULL;

  /*
   * PHASE 1: (Prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   */

  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
      host_recv = this->HostBufferMalloc(rbytes);
      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      CommsRequest_t srq;
      srq.PacketType = InterNodeRecv;
      srq.bytes      = rbytes;
      srq.req        = rrq;
      srq.host_buf   = host_recv;
      srq.device_buf = recv;
      list.push_back(srq);
      off_node_bytes+=rbytes;
    }
  }

  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {

      tag= dir+_processor*32;

      host_xmit = this->HostBufferMalloc(xbytes);
      CommsRequest_t srq;

      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch

      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      //      assert(ierr==0);
      //      off_node_bytes+=xbytes;

      srq.PacketType = InterNodeXmit;
      srq.bytes      = xbytes;
      //      srq.req        = xrq;
      srq.host_buf   = host_xmit;
      srq.device_buf = xmit;
      srq.tag        = tag;
      srq.dest       = dest;
      srq.commdir    = commdir;
      list.push_back(srq);
    }
  }

  return off_node_bytes;
}
/*
 * In the interest of better pipelining, poll for completion on each DtoH and
 * start MPI_Isend in the meantime
 */
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
  int pending = 0;
  do {

    pending = 0;

    for(int idx = 0; idx<list.size();idx++){

      if ( list[idx].PacketType==InterNodeRecv ) {

	int flag = 0;
	MPI_Status status;
	int ierr = MPI_Test(&list[idx].req,&flag,&status);
	assert(ierr==0);

	if ( flag ) {
	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
	  list[idx].PacketType=InterNodeReceiveHtoD;
	} else {
	  pending ++;
	}
      }
    }
    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
  } while ( pending );

}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
  int pending = 0;
  do {

    pending = 0;

    for(int idx = 0; idx<list.size();idx++){

      if ( list[idx].PacketType==InterNodeXmit ) {

	if ( acceleratorEventIsComplete(list[idx].ev) ) {

	  void *host_xmit = list[idx].host_buf;
	  uint32_t xbytes = list[idx].bytes;
	  int dest        = list[idx].dest;
	  int tag         = list[idx].tag;
	  int commdir     = list[idx].commdir;
	  ///////////////////
	  // Send packet
	  ///////////////////

	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;

	  MPI_Request xrq;
	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
	  assert(ierr==0);

	  list[idx].req = xrq; // Update the MPI request in the list

	  list[idx].PacketType=InterNodeXmitISend;

	} else {
	  // not done, so return to polling loop
	  pending++;
	}
      }
    }
  } while (pending);
}

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							  void *xmit,
							  int dest,int dox,
							  void *recv,
							  int from,int dor,
							  int xbytes,int rbytes,int dir)
{
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

  MPI_Request xrq;
  MPI_Request rrq;

  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];

  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;

  void * host_xmit = NULL;

  ////////////////////////////////
  // Receives already posted
  // Copies already started
  ////////////////////////////////
  /*
   * PHASE 2: (Begin)
   * - complete all copies
   * - post MPI send asynch
   */
#ifdef NVLINK_GET
  if ( dor ) {

    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);

      CommsRequest_t srq;

      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);

      srq.PacketType = IntraNodeRecv;
      srq.bytes      = rbytes;  // bookkeeping for the receive side
      //      srq.req        = xrq;
      srq.host_buf   = NULL;
      srq.device_buf = recv;    // the copy target is the receive buffer
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);
    }
  }
#else
  if (dox) {

    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);

      CommsRequest_t srq;

      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);

      srq.PacketType = IntraNodeXmit;
      srq.bytes      = xbytes;
      //      srq.req        = xrq;
      srq.host_buf   = NULL;
      srq.device_buf = xmit;
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);

    }
  }
#endif
  return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

  std::vector<MPI_Status>  status;
  std::vector<MPI_Request> MpiRequests;

  for(int r=0;r<list.size();r++){
    // Must check each Send buf is clear to reuse
    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
  }

  int nreq=MpiRequests.size();

  if (nreq>0) {
    status.resize(MpiRequests.size());
    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
    assert(ierr==0);
  }

  //  for(int r=0;r<nreq;r++){
  //    if ( list[r].PacketType==InterNodeRecv ) {
  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
  //    }
  //  }

  list.resize(0);              // Delete the list
  this->HostBufferFreeAll();   // Clean up the buffer allocs
#ifndef NVLINK_GET
  this->StencilBarrier();      // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////

void CartesianCommunicator::StencilBarrier(void)
{
  MPI_Barrier  (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
		     bytes,
		     MPI_BYTE,
		     root,
		     communicator);
  assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
  int r;
  MPI_Comm_rank(communicator_world,&r);
  return r;
}
void CartesianCommunicator::BarrierWorld(void){
  int ierr = MPI_Barrier(communicator_world);
  assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
		      bytes,
		      MPI_BYTE,
		      root,
		      communicator_world);
  assert(ierr==0);
}

void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
{
  Coordinate row(_ndimension,1);
  assert(dim>=0 && dim<_ndimension);

  //  Split the communicator
  row[dim] = _processors[dim];

  int me;
  CartesianCommunicator Comm(row,*this,me);
  Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
{
  // MPI is a pain and uses "int" arguments.
  // 64*64*64*128*16 == 500 million elements of data; in 24*4-byte multiples the
  // byte count reaches ~5x10^10 >> 2x10^9, an "int"-overflow (Y2K-style) bug.
  // (Turns up on 32^3 x 64 Gparity too.)
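  // Worked arithmetic: 64*64*64*128*16 = 536,870,912 words; at 24*4 = 96 bytes
  // per word that is ~5.2e10 bytes, far above INT_MAX (~2.1e9). Hence the
  // contiguous datatype below, which keeps the "int" count at the word level
  // rather than the byte level.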
  MPI_Datatype object;
  int iwords;
  int ibytes;
  iwords = words;
  ibytes = bytes;
  assert(words == iwords); // safe to cast to int ?
  assert(bytes == ibytes); // safe to cast to int ?
  MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
  MPI_Type_commit(&object);
  MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
  MPI_Type_free(&object);
}

NAMESPACE_END(Grid);