From 1ef424b1392038df12130b1ce2f855c8b1cc1dbd Mon Sep 17 00:00:00 2001
From: paboyle
Date: Fri, 27 Oct 2017 14:20:35 +0100
Subject: [PATCH] Split grid Y2K bug fix attempt

---
Reviewer notes (not part of the commit message):

* ~CartesianCommunicator(): the guard is now `!MPI_is_finalised`.
  The line being removed freed the communicator only while MPI was not
  yet finalized (`!MPI::Is_finalized()`); the replacement as posted
  dropped the negation, so MPI_Comm_free() would have been skipped
  while MPI is running and called only after finalization.
* AllToAll(): the "Y2K bug" mentioned in the code comment is overflow
  of MPI's `int` count argument. The change sends `words` elements of a
  `bytes`-sized contiguous datatype instead of one large byte count,
  and asserts that both values survive the cast to int.
* This copy of the patch was damaged in extraction: template argument
  lists (e.g. `std::vector row`, `Lattice & split`) and most of the
  Communicator_base.h hunk are missing, and blank context lines were
  dropped, so the @@ line counts may not match the hunk bodies.
  Regenerate from the commit before applying.

 lib/communicator/Communicator_base.h  | 14 +++++++++-----
 lib/communicator/Communicator_mpi.cc  | 26 +++++++++++++++++++++-----
 lib/communicator/Communicator_none.cc |  8 ++++++--
 lib/lattice/Lattice_transfer.h        |  8 ++++----
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 22c9e4d0..ff054497 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -274,12 +274,16 @@ class CartesianCommunicator {
     // std::cerr << " AllToAll in.size() "< void Broadcast(int root,obj &data)
     {
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index 5a2dc4d0..ef612f98 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -55,7 +55,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 CartesianCommunicator::~CartesianCommunicator()
 {
-  if (communicator && !MPI::Is_finalized())
+  int MPI_is_finalised;
+  MPI_Finalized(&MPI_is_finalised);
+  if (communicator && !MPI_is_finalised)
     MPI_Comm_free(&communicator);
 }
@@ -195,7 +197,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
                        communicator);
   assert(ierr==0);
 }
-void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes)
+void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
 {
   std::vector row(_ndimension,1);
   assert(dim>=0 && dim<_ndimension);
@@ -204,11 +206,25 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes)
   row[dim] = _processors[dim];
   CartesianCommunicator Comm(row,*this);
-  Comm.AllToAll(in,out,bytes);
+  Comm.AllToAll(in,out,words,bytes);
 }
-void CartesianCommunicator::AllToAll(void *in,void *out,int bytes)
+void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
 {
-  MPI_Alltoall(in ,bytes,MPI_BYTE,out,bytes,MPI_BYTE,communicator);
+  // MPI is a pain and uses "int" arguments
+  // 64*64*64*128*16 == 500Million elements of data.
+  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
+  // (Turns up on 32^3 x 64 Gparity too)
+  MPI_Datatype object;
+  int iwords;
+  int ibytes;
+  iwords = words;
+  ibytes = bytes;
+  assert(words == iwords); // safe to cast to int ?
+  assert(bytes == ibytes); // safe to cast to int ?
+  MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
+  MPI_Type_commit(&object);
+  MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+  MPI_Type_free(&object);
 }
 ///////////////////////////////////////////////////////
 // Should only be used prior to Grid Init finished.
diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc
index 629a3e4a..a862d52a 100644
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -100,9 +100,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector &
 {
   assert(0);
 }
-void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes)
+void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
 {
-  bcopy(in,out,bytes);
+  bcopy(in,out,bytes*words);
+}
+void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
+{
+  bcopy(in,out,bytes*words);
 }
 int CartesianCommunicator::RankWorld(void){return 0;}
diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 713a8788..bc59e9eb 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -790,8 +790,8 @@ void Grid_split(std::vector > & full,Lattice & split)
     ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
   }
-  int lsites = full_grid->lSites();
-  Integer sz = lsites * nvector;
+  uint64_t lsites = full_grid->lSites();
+  uint64_t sz = lsites * nvector;
   std::vector tmpdata(sz);
   std::vector alldata(sz);
   std::vector scalardata(lsites);
@@ -908,8 +908,8 @@ void Grid_unsplit(std::vector > & full,Lattice & split)
     ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
   }
-  int lsites = full_grid->lSites();
-  Integer sz = lsites * nvector;
+  uint64_t lsites = full_grid->lSites();
+  uint64_t sz = lsites * nvector;
   std::vector tmpdata(sz);
   std::vector alldata(sz);
   std::vector scalardata(lsites);