Mirror of https://github.com/paboyle/Grid.git

Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit

paboyle 2017-06-24 23:10:24 +01:00
parent 869b99ec1e
commit 54e94360ad
13 changed files with 139 additions and 90 deletions
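The change threads a direction index through the stencil communication API and, for the new mpit comms target, gives each halo direction its own MPI communicator so that exchanges issued concurrently (for example from OpenMP tasks) need not serialise on a single communicator's internal lock. The code that actually fills communicator_halo is not part of this diff; the sketch below is only an assumption of how per-direction communicators could be created, and the helper name make_halo_communicators is invented for illustration.

#include <mpi.h>
#include <vector>

// Hypothetical helper (not from this commit): duplicate a parent communicator
// once per halo direction so each direction owns an independent context.
std::vector<MPI_Comm> make_halo_communicators(MPI_Comm parent, int ndirections) {
  std::vector<MPI_Comm> halo(ndirections);
  for (int dir = 0; dir < ndirections; dir++) {
    MPI_Comm_dup(parent, &halo[dir]); // collective call over 'parent'
  }
  return halo;
}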

View File

@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 int Nloop=100;
 int nmu=0;
-int maxlat=24;
+int maxlat=32;
 for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 header();
 for(int lat=4;lat<=maxlat;lat+=4){
-for(int Ls=8;Ls<=32;Ls*=2){
+for(int Ls=8;Ls<=8;Ls*=2){
 std::vector<int> latt_size ({lat*mpi_layout[0],
 lat*mpi_layout[1],
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
 header();
 for(int lat=4;lat<=maxlat;lat+=4){
-for(int Ls=8;Ls<=32;Ls*=2){
+for(int Ls=8;Ls<=8;Ls*=2){
 std::vector<int> latt_size ({lat,lat,lat,lat});
@@ -249,7 +249,7 @@ int main (int argc, char ** argv)
 header();
 for(int lat=4;lat<=maxlat;lat+=4){
-for(int Ls=8;Ls<=32;Ls*=2){
+for(int Ls=8;Ls<=8;Ls*=2){
 std::vector<int> latt_size ({lat*mpi_layout[0],
 lat*mpi_layout[1],
@@ -299,7 +299,7 @@ int main (int argc, char ** argv)
 xmit_to_rank,
 (void *)&rbuf[mu][0],
 recv_from_rank,
-bytes);
+bytes,mu);
 comm_proc = mpi_layout[mu]-1;
@@ -310,11 +310,11 @@ int main (int argc, char ** argv)
 xmit_to_rank,
 (void *)&rbuf[mu+4][0],
 recv_from_rank,
-bytes);
+bytes,mu+4);
 }
 }
-Grid.StencilSendToRecvFromComplete(requests);
+Grid.StencilSendToRecvFromComplete(requests,0);
 Grid.Barrier();
 double stop=usecond();
 t_time[i] = stop-start; // microseconds
@@ -346,7 +346,7 @@ int main (int argc, char ** argv)
 header();
 for(int lat=4;lat<=maxlat;lat+=4){
-for(int Ls=8;Ls<=32;Ls*=2){
+for(int Ls=8;Ls<=8;Ls*=2){
 std::vector<int> latt_size ({lat*mpi_layout[0],
 lat*mpi_layout[1],
@@ -393,8 +393,8 @@ int main (int argc, char ** argv)
 xmit_to_rank,
 (void *)&rbuf[mu][0],
 recv_from_rank,
-bytes);
+bytes,mu);
-Grid.StencilSendToRecvFromComplete(requests);
+Grid.StencilSendToRecvFromComplete(requests,mu);
 requests.resize(0);
 comm_proc = mpi_layout[mu]-1;
@@ -406,8 +406,8 @@ int main (int argc, char ** argv)
 xmit_to_rank,
 (void *)&rbuf[mu+4][0],
 recv_from_rank,
-bytes);
+bytes,mu+4);
-Grid.StencilSendToRecvFromComplete(requests);
+Grid.StencilSendToRecvFromComplete(requests,mu+4);
 requests.resize(0);
 }
@@ -435,6 +435,9 @@ int main (int argc, char ** argv)
 }
 }
+std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 Grid_finalize();
 }

View File

@@ -324,14 +324,14 @@ case ${ac_COMMS} in
 AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
 comms_type='none'
 ;;
-mpi3l*)
-AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-comms_type='mpi3l'
-;;
 mpi3*)
 AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
 comms_type='mpi3'
 ;;
+mpit)
+AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+comms_type='mpit'
+;;
 mpi*)
 AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
 comms_type='mpi'
@@ -359,7 +359,7 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
 ############### RNG selection
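Configuration note: the mpi3l case and its BUILD_COMMS_MPI3L conditional are dropped, and --enable-comms=mpit now defines GRID_COMMS_MPIT and sets comms_type='mpit'; the BUILD_COMMS_MPIT conditional is what the Makefile.am change below keys on to compile Communicator_mpit.cc.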

View File

@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
 extra_sources+=communicator/Communicator_base.cc
 endif
-if BUILD_COMMS_MPI3L
-extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+extra_sources+=communicator/Communicator_mpit.cc
 extra_sources+=communicator/Communicator_base.cc
 endif

View File

@@ -89,25 +89,31 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 GlobalSumVector((double *)c,2*N);
 }
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3)
 int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
 int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 void *xmit,
 int xmit_to_rank,
 void *recv,
 int recv_from_rank,
-int bytes)
+int bytes, int dir)
 {
+// Discard the "dir"
 SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
 return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
 SendToRecvFromComplete(waitall);
 }
+#endif
+#if !defined( GRID_COMMS_MPI3)
 void CartesianCommunicator::StencilBarrier(void){};
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;

View File

@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -64,7 +64,7 @@ class CartesianCommunicator {
 std::vector<int> _processor_coor; // linear processor coordinate
 unsigned long _ndimension;
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
 static MPI_Comm communicator_world;
 MPI_Comm communicator;
 typedef MPI_Request CommsRequest_t;
@@ -72,6 +72,10 @@ class CartesianCommunicator {
 typedef int CommsRequest_t;
 #endif
+#if defined (GRID_COMMS_MPIT)
+std::vector<MPI_Comm> communicator_halo;
+#endif
 ////////////////////////////////////////////////////////////////////
 // Helper functionality for SHM Windows common to all other impls
 ////////////////////////////////////////////////////////////////////
@@ -212,13 +216,13 @@ class CartesianCommunicator {
 void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 void *xmit,
 int xmit_to_rank,
 void *recv,
 int recv_from_rank,
-int bytes);
+int bytes,int dir);
-void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
 void StencilBarrier(void);
 ////////////////////////////////////////////////////////////

View File

@@ -600,11 +600,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 void *xmit,
 int dest,
 void *recv,
 int from,
-int bytes)
+int bytes,int dir)
 {
 MPI_Request xrq;
 MPI_Request rrq;
@@ -643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
 SendToRecvFromComplete(waitall);
 }

View File

@@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 assert(ierr==0);
 }
-double CartesianCommunicator::StencilSendToRecvFromBegin(int dir,
-std::vector<CommsRequest_t> &list,
-void *xmit,
-int xmit_to_rank,
-void *recv,
-int recv_from_rank,
-int bytes)
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes,int dir)
 {
 int myrank = _processor;
 int ierr;
+assert(dir < communicator_halo.size());
+// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
 // Give the CPU to MPI immediately; can use threads to overlap optionally
-ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-recv,bytes,MPI_CHAR,from, from,
+ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
 communicator_halo[dir],MPI_STATUS_IGNORE);
 assert(ierr==0);
 return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall){ };
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+// Do nothing
+};
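In this mpit implementation the Begin call is itself a blocking MPI_Sendrecv on the per-direction communicator and Complete does nothing, so any overlap must come from issuing several Begin calls concurrently, each on its own communicator_halo[dir]. A minimal sketch of that usage pattern follows; it assumes MPI was initialised with MPI_THREAD_MULTIPLE and that the communicators were duplicated per direction, and the function and variable names are illustrative rather than taken from this commit.

#include <mpi.h>
#include <vector>

// One blocking exchange per direction, issued from concurrent OpenMP loop
// iterations; distinct communicators let the exchanges proceed independently.
void halo_exchange_all_dirs(const std::vector<MPI_Comm> &halo,
                            std::vector<std::vector<char> > &send,
                            std::vector<std::vector<char> > &recv,
                            const std::vector<int> &to_rank,
                            const std::vector<int> &from_rank,
                            int myrank)
{
#pragma omp parallel for
  for (int dir = 0; dir < (int)halo.size(); dir++) {
    MPI_Sendrecv(send[dir].data(), (int)send[dir].size(), MPI_CHAR, to_rank[dir], myrank,
                 recv[dir].data(), (int)recv[dir].size(), MPI_CHAR, from_rank[dir], from_rank[dir],
                 halo[dir], MPI_STATUS_IGNORE);
  }
}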

View File

@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif

View File

@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
 int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
 MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM

View File

@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
 #define USE_MPI_IO
 #else
 #undef USE_MPI_IO

View File

@@ -379,7 +379,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
 // assert((dag==DaggerNo) ||(dag==DaggerYes));
-typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 Compressor compressor(dag);
@@ -388,46 +387,46 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 DhopFaceTime-=usecond();
 st.HaloExchangeOptGather(in,compressor);
-st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 DhopFaceTime+=usecond();
-std::vector<std::vector<CommsRequest_t> > reqs;
 // Rely on async comms; start comms before merge of local data
+DhopComputeTime-=usecond();
 DhopCommTime-=usecond();
-st.CommunicateBegin(reqs);
+DhopFaceTime-=usecond();
+st.CommsMergeSHM(compressor);
+DhopFaceTime+=usecond();
+// Perhaps use omp task and region
 #pragma omp parallel
 {
-int nthreads = omp_get_num_threads();
-int me = omp_get_thread_num();
-int myoff, mywork;
-GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-int sF = LLs * myoff;
-if ( me == 0 ) {
-st.CommunicateComplete(reqs);
-DhopCommTime+=usecond();
-} else {
-// Interior links in stencil
-if ( me==1 ) DhopComputeTime-=usecond();
-if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-if ( me==1 ) DhopComputeTime+=usecond();
+// Should time this somehow; hard as the threads fork nowait
+st.CommunicateThreaded();
+if (dag == DaggerYes) {
+#pragma omp for
+for (int ss = 0; ss < U._grid->oSites(); ss++) {
+int sU = ss;
+int sF = LLs * sU;
+Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+}
+} else {
+#pragma omp for
+for (int ss = 0; ss < U._grid->oSites(); ss++) {
+int sU = ss;
+int sF = LLs * sU;
+Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 }
 }
+#pragma omp single
+DhopComputeTime+=usecond();
+#pragma omp taskwait
+#pragma omp single
+DhopCommTime+=usecond();
+} // Closes parallel region and waits the comms (I hope)
 DhopFaceTime-=usecond();
 st.CommsMerge(compressor);
 DhopFaceTime+=usecond();
+// Load imbalance alert. Should use dynamic schedule OMP for loop
+// Perhaps create a list of only those sites with face work, and
+// load balance process the list.
 DhopComputeTime2-=usecond();
 if (dag == DaggerYes) {
 int sz=st.surface_list.size();
@@ -448,11 +447,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else
 assert(0);
 #endif
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 DoubledGaugeField & U,
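The rewritten DhopInternalOverlappedComms keeps the whole thread team busy on the interior compute loop while the halo exchanges run as OpenMP tasks, and relies on taskwait plus the end of the parallel region to guarantee the comms have finished before the face terms are merged. The fragment below is a generic, simplified sketch of that task-plus-worksharing overlap pattern, not the Grid kernel itself; the names and the single nowait placement are illustrative assumptions.

#include <cstdio>

void overlapped(int npackets, int nsites) {
#pragma omp parallel
  {
#pragma omp single nowait
    {
      for (int p = 0; p < npackets; p++) {
#pragma omp task firstprivate(p)
        std::printf("exchange packet %d\n", p); // stands in for one halo exchange
      }
    }
#pragma omp for
    for (int s = 0; s < nsites; s++) {
      (void)s; // stands in for Kernels::DhopSite / DhopSiteDag on interior site s
    }
#pragma omp taskwait
    // face-term merge would follow once both compute and comms are complete
  }
}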

View File

@@ -248,24 +248,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 //////////////////////////////////////////
 // Comms packet queue for asynch thread
 //////////////////////////////////////////
+void CommunicateThreaded()
+{
+for(int i=0;i<Packets.size();i++){
+#pragma omp task
+{
+double start;
+double stop;
+start = usecond();
+uint64_t bytes;
+std::vector<CommsRequest_t> reqs;
+bytes=_grid->StencilSendToRecvFromBegin(reqs,
+Packets[i].send_buf,
+Packets[i].to_rank,
+Packets[i].recv_buf,
+Packets[i].from_rank,
+Packets[i].bytes,i);
+_grid->StencilSendToRecvFromComplete(reqs,i);
+// Last task logged; this is approximate but hard to catch
+// the last to complete
+stop = usecond();
+stop = stop - start;
+if ( i==0 ) commtime+=stop;
+#pragma omp critical
+{
+comms_bytes+=bytes;
+}
+}
+}
+}
 void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 {
 reqs.resize(Packets.size());
 commtime-=usecond();
 for(int i=0;i<Packets.size();i++){
 comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
 Packets[i].send_buf,
 Packets[i].to_rank,
 Packets[i].recv_buf,
 Packets[i].from_rank,
-Packets[i].bytes);
+Packets[i].bytes,i);
 }
 }
 void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 {
 for(int i=0;i<Packets.size();i++){
-_grid->StencilSendToRecvFromComplete(reqs[i]);
+_grid->StencilSendToRecvFromComplete(reqs[i],i);
 }
 commtime+=usecond();
 }
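CommunicateThreaded passes the packet index i as the new dir argument, so in the mpit build each packet's exchange lands on its own communicator_halo entry; the per-packet #pragma omp task constructs only run concurrently when the method is called from inside an enclosing OpenMP parallel region, which is how DhopInternalOverlappedComms now invokes it. The commtime accounting is deliberately approximate: only the i==0 task contributes, because the last task to finish is hard to identify.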

View File

@@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
 MPI_Finalize();
 Grid_unquiesce_nodes();
 #endif