Mirror of https://github.com/paboyle/Grid.git
Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit
Parent: 869b99ec1e
Commit: 54e94360ad
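The idea, as far as these hunks show, is to give the new --enable-comms=mpit target one MPI communicator per stencil direction, so that halo exchanges issued concurrently from OpenMP tasks do not all funnel through a single communicator. A standalone sketch of that pattern, not taken from Grid (ndir, buffer sizes and the ring-neighbour choice are illustrative only):

    // Sketch only: one duplicated communicator per halo direction, driven
    // from OpenMP tasks. Not Grid code; names are illustrative.
    #include <mpi.h>
    #include <vector>

    int main(int argc, char **argv) {
      int provided;
      // Concurrent MPI calls from several threads/tasks need MPI_THREAD_MULTIPLE.
      MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
      if (provided < MPI_THREAD_MULTIPLE) MPI_Abort(MPI_COMM_WORLD, 1);

      const int ndir = 8; // 4 dimensions, forward and backward
      std::vector<MPI_Comm> communicator_halo(ndir);
      for (int d = 0; d < ndir; d++)
        MPI_Comm_dup(MPI_COMM_WORLD, &communicator_halo[d]); // one comm per direction

      int rank, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      const int bytes = 1 << 20;
      std::vector<std::vector<char>> sendbuf(ndir, std::vector<char>(bytes));
      std::vector<std::vector<char>> recvbuf(ndir, std::vector<char>(bytes));
      int to   = (rank + 1) % size;
      int from = (rank - 1 + size) % size;

    #pragma omp parallel
    #pragma omp single
      {
        for (int d = 0; d < ndir; d++) {
    #pragma omp task firstprivate(d)
          {
            // Each task drives its own communicator, so per-direction traffic
            // is matched independently of the other directions.
            MPI_Sendrecv(sendbuf[d].data(), bytes, MPI_CHAR, to,   rank,
                         recvbuf[d].data(), bytes, MPI_CHAR, from, from,
                         communicator_halo[d], MPI_STATUS_IGNORE);
          }
        }
      } // implicit barrier: all tasks are complete here

      for (int d = 0; d < ndir; d++) MPI_Comm_free(&communicator_halo[d]);
      MPI_Finalize();
      return 0;
    }

Duplicated communicators give each direction an independent message-matching context; whether this also avoids internal locking inside the MPI library is implementation dependent, which is presumably what this experiment is probing.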
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
   int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat,lat,lat,lat});

@@ -249,7 +249,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],
@@ -299,7 +299,7 @@ int main (int argc, char ** argv)
                                            xmit_to_rank,
                                            (void *)&rbuf[mu][0],
                                            recv_from_rank,
-                                           bytes);
+                                           bytes,mu);

          comm_proc = mpi_layout[mu]-1;

@@ -310,11 +310,11 @@ int main (int argc, char ** argv)
                                            xmit_to_rank,
                                            (void *)&rbuf[mu+4][0],
                                            recv_from_rank,
-                                           bytes);
+                                           bytes,mu+4);

        }
      }
-     Grid.StencilSendToRecvFromComplete(requests);
+     Grid.StencilSendToRecvFromComplete(requests,0);
      Grid.Barrier();
      double stop=usecond();
      t_time[i] = stop-start; // microseconds
@@ -346,7 +346,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],
@@ -393,8 +393,8 @@ int main (int argc, char ** argv)
                                            xmit_to_rank,
                                            (void *)&rbuf[mu][0],
                                            recv_from_rank,
-                                           bytes);
-         Grid.StencilSendToRecvFromComplete(requests);
+                                           bytes,mu);
+         Grid.StencilSendToRecvFromComplete(requests,mu);
          requests.resize(0);

          comm_proc = mpi_layout[mu]-1;
@@ -406,8 +406,8 @@ int main (int argc, char ** argv)
                                            xmit_to_rank,
                                            (void *)&rbuf[mu+4][0],
                                            recv_from_rank,
-                                           bytes);
-         Grid.StencilSendToRecvFromComplete(requests);
+                                           bytes,mu+4);
+         Grid.StencilSendToRecvFromComplete(requests,mu+4);
          requests.resize(0);

        }
@@ -435,6 +435,9 @@ int main (int argc, char ** argv)

    }
  }
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

  Grid_finalize();
}
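Two things change in the benchmark hunks above: the sweep is narrowed (Ls fixed at 8, maxlat raised to 32), and the stencil calls now carry a direction index, with the forward halo in dimension mu tagged mu (0-3) and the backward halo mu+4 (4-7) so each of the eight exchanges can be routed to its own communicator. The per-direction variant completes each transfer with the matching index, while the batched variant still issues a single completion with index 0.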
configure.ac
@@ -324,14 +324,14 @@ case ${ac_COMMS} in
      AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
      comms_type='none'
      ;;
-     mpi3l*)
-       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-       comms_type='mpi3l'
-       ;;
     mpi3*)
       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
       comms_type='mpi3'
       ;;
+     mpit)
+       AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+       comms_type='mpit'
+       ;;
     mpi*)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
       comms_type='mpi'
@@ -359,7 +359,7 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

 ############### RNG selection
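With this stanza in place the experimental backend is selected at configure time via --enable-comms=mpit, replacing the retired mpi3l case; the existing mpi, mpi3, shmem and none targets are unchanged.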
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_base.cc
 endif

-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
   extra_sources+=communicator/Communicator_base.cc
 endif

@@ -89,25 +89,31 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
   GlobalSumVector((double *)c,2*N);
 }

-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3)

 int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
 int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int xmit_to_rank,
                                                          void *recv,
                                                          int recv_from_rank,
-                                                         int bytes)
+                                                         int bytes, int dir)
 {
+  // Discard the "dir"
   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
+#endif
+
+#if !defined( GRID_COMMS_MPI3)

 void CartesianCommunicator::StencilBarrier(void){};

 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
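Note that the generic fallback now accepts the extra dir argument but simply discards it, forwarding to the ordinary blocking SendToRecvFrom path, so code written against the new per-direction signature still compiles and runs on comms backends that have no per-direction communicators.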
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -64,7 +64,7 @@ class CartesianCommunicator {
   std::vector<int> _processor_coor; // linear processor coordinate
   unsigned long _ndimension;

-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   static MPI_Comm communicator_world;
   MPI_Comm communicator;
   typedef MPI_Request CommsRequest_t;
@@ -72,6 +72,10 @@ class CartesianCommunicator {
   typedef int CommsRequest_t;
 #endif

+#if defined (GRID_COMMS_MPIT)
+  std::vector<MPI_Comm> communicator_halo;
+#endif
+
   ////////////////////////////////////////////////////////////////////
   // Helper functionality for SHM Windows common to all other impls
   ////////////////////////////////////////////////////////////////////
@@ -212,13 +216,13 @@ class CartesianCommunicator {
   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                     void *xmit,
                                     int xmit_to_rank,
                                     void *recv,
                                     int recv_from_rank,
-                                    int bytes);
+                                    int bytes,int dir);

-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
   void StencilBarrier(void);

   ////////////////////////////////////////////////////////////
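The header hunks only declare the per-direction communicator vector for the MPIT build and widen the two stencil entry points to take a direction index; how communicator_halo is actually populated (presumably one duplicated communicator per stencil direction, along the lines of the sketch near the top of this page) is not part of the hunks shown here.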
@@ -600,11 +600,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int dest,
                                                          void *recv,
                                                          int from,
-                                                         int bytes)
+                                                         int bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
@@ -643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques

   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
@@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
   assert(ierr==0);
 }

-double CartesianCommunicator::StencilSendToRecvFromBegin(int dir,
-                                                         std::vector<CommsRequest_t> &list,
-                                                         void *xmit,
-                                                         int xmit_to_rank,
-                                                         void *recv,
-                                                         int recv_from_rank,
-                                                         int bytes)
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                         void *xmit,
+                                                         int xmit_to_rank,
+                                                         void *recv,
+                                                         int recv_from_rank,
+                                                         int bytes,int dir)
 {

   int myrank = _processor;
   int ierr;
+  assert(dir < communicator_halo.size());

+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
   //  Give the CPU to MPI immediately; can use threads to overlap optionally
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                    recv,bytes,MPI_CHAR,from, from,
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+                    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
                     communicator_halo[dir],MPI_STATUS_IGNORE);
   assert(ierr==0);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall){ };
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  // Do nothing
+};

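In this implementation (the MPIT communicator, per the Makefile change above) StencilSendToRecvFromBegin is a blocking MPI_Sendrecv on communicator_halo[dir], so there is nothing left to wait for and StencilSendToRecvFromComplete is deliberately empty. Since these calls are issued concurrently from OpenMP tasks (see the Stencil change further down), the MPI library presumably has to be initialised at the MPI_THREAD_MULTIPLE level for this to be legal.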
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif

-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif

@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H

-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
 #define USE_MPI_IO
 #else
 #undef USE_MPI_IO
@@ -379,7 +379,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
   //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

   Compressor compressor(dag);

@@ -388,46 +387,46 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg

   DhopFaceTime-=usecond();
   st.HaloExchangeOptGather(in,compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;

   // Rely on async comms; start comms before merge of local data
+  DhopComputeTime-=usecond();
   DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);

-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();

-  // Perhaps use omp task and region
 #pragma omp parallel
   {
-    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
+    // Should time this somehow; hard as the threads fork nowait
+    st.CommunicateThreaded();

-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else {
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else                  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
+    if (dag == DaggerYes) {
+#pragma omp for
+      for (int ss = 0; ss < U._grid->oSites(); ss++) {
+        int sU = ss;
+        int sF = LLs * sU;
+        Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+      }
+    } else {
+#pragma omp for
+      for (int ss = 0; ss < U._grid->oSites(); ss++) {
+        int sU = ss;
+        int sF = LLs * sU;
+        Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
      }
    }
+#pragma omp single
+    DhopComputeTime+=usecond();
+
+#pragma omp taskwait
+
+#pragma omp single
+    DhopCommTime+=usecond();
+  } // Closes parallel region and waits the comms (I hope)

   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();

-  // Load imbalance alert. Should use dynamic schedule OMP for loop
-  // Perhaps create a list of only those sites with face work, and
-  // load balance process the list.
   DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
@@ -448,11 +447,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else
   assert(0);
 #endif

 }
-
-

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
                                                     DoubledGaugeField & U,
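The restructured overlapped path above drops the explicit CommunicateBegin/CommunicateComplete pair and the hand-rolled thread partitioning in favour of a single st.CommunicateThreaded() inside the parallel region, with ordinary omp for loops doing the interior compute. A minimal standalone sketch of that overlap pattern, under the assumption that the tasks stand in for the per-direction halo exchange and the loop for the stencil kernel (toy buffers, not the Grid kernels):

    // Sketch: comms in OpenMP tasks, compute in a worksharing loop, joined by
    // the barrier at the end of the parallel region. Not Grid code.
    #include <omp.h>
    #include <cstdio>
    #include <vector>

    int main() {
      const int ndir  = 8;
      const int sites = 1 << 16;
      std::vector<double> field(sites, 1.0);

    #pragma omp parallel
      {
    #pragma omp single nowait
        {
          for (int d = 0; d < ndir; d++) {
    #pragma omp task firstprivate(d)
            {
              // stand-in for the per-direction halo exchange
              printf("exchange direction %d on thread %d\n", d, omp_get_thread_num());
            }
          }
        }

    #pragma omp for schedule(static) nowait
        for (int s = 0; s < sites; s++) {
          field[s] *= 2.0; // stand-in for the interior stencil kernel
        }
      } // implicit barrier: loop iterations and all pending tasks finish here

      return 0;
    }

Idle threads pick up the communication tasks at scheduling points (including the final barrier), so compute and comms can proceed concurrently without a dedicated comms thread.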
@@ -248,24 +248,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     //////////////////////////////////////////
     // Comms packet queue for asynch thread
     //////////////////////////////////////////
+    void CommunicateThreaded()
+    {
+      for(int i=0;i<Packets.size();i++){
+#pragma omp task
+        {
+          double start;
+          double stop;
+          start = usecond();
+          uint64_t bytes;
+          std::vector<CommsRequest_t> reqs;
+          bytes=_grid->StencilSendToRecvFromBegin(reqs,
+                                                  Packets[i].send_buf,
+                                                  Packets[i].to_rank,
+                                                  Packets[i].recv_buf,
+                                                  Packets[i].from_rank,
+                                                  Packets[i].bytes,i);
+          _grid->StencilSendToRecvFromComplete(reqs,i);
+          // Last task logged; this is approximate but hard to catch
+          // the last to complete
+          stop = usecond();
+          stop = stop - start;
+
+          if ( i==0 ) commtime+=stop;
+
+#pragma omp critical
+          {
+            comms_bytes+=bytes;
+          }
+
+        }
+      }
+
+    }
     void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
       reqs.resize(Packets.size());
       commtime-=usecond();
       for(int i=0;i<Packets.size();i++){
         comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
                                                        Packets[i].send_buf,
                                                        Packets[i].to_rank,
                                                        Packets[i].recv_buf,
                                                        Packets[i].from_rank,
-                                                       Packets[i].bytes);
+                                                       Packets[i].bytes,i);
       }
     }

     void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
       for(int i=0;i<Packets.size();i++){
-        _grid->StencilSendToRecvFromComplete(reqs[i]);
+        _grid->StencilSendToRecvFromComplete(reqs[i],i);
       }
       commtime+=usecond();
     }
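CommunicateThreaded packages each packet as its own OpenMP task, using the packet index i both to pick the halo communicator and to complete that packet immediately; it is intended to be called from inside the parallel region of the overlapped Dhop path above. As the in-code comment notes, attributing the elapsed time of packet 0 to commtime is only an approximation, since the last task to finish is hard to identify.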
@@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv)

 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif