Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-04 05:54:32 +00:00)

Commit:
	Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit
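
The idea behind the experiment: with a single MPI communicator, halo exchanges issued concurrently from several threads all funnel through one communication context, which most MPI implementations serialise on an internal lock. Giving each stencil direction its own communicator gives each thread or task an independent context to progress. Below is a minimal sketch of that setup, assuming MPI has been initialised at the MPI_THREAD_MULTIPLE level; InitHaloCommunicators is an illustrative helper, not Grid code, while the communicator_halo member itself is what this commit adds to CartesianCommunicator:

    #include <mpi.h>
    #include <vector>

    std::vector<MPI_Comm> communicator_halo;  // one communicator per stencil direction

    void InitHaloCommunicators(MPI_Comm world, int ndir) {  // hypothetical helper
      communicator_halo.resize(ndir);
      for (int dir = 0; dir < ndir; dir++) {
        // MPI_Comm_dup creates an isolated communication context, so messages
        // sent on communicator_halo[dir] can never match, or contend with,
        // another direction's traffic.
        MPI_Comm_dup(world, &communicator_halo[dir]);
      }
    }

The diff follows; the +/- markers reconstruct the old/new sides shown on the original page.
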
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
   int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
@@ -249,7 +249,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -299,7 +299,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
@@ -310,11 +310,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
 	  
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests);
+	Grid.StencilSendToRecvFromComplete(requests,0);
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
@@ -346,7 +346,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -393,8 +393,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
-	    Grid.StencilSendToRecvFromComplete(requests);
+					      bytes,mu);
+	    Grid.StencilSendToRecvFromComplete(requests,mu);
 	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
@@ -406,8 +406,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
-	    Grid.StencilSendToRecvFromComplete(requests);
+					      bytes,mu+4);
+	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 	    requests.resize(0);
 	  
 	  }
@@ -435,6 +435,9 @@ int main (int argc, char ** argv)
 
    }
  }    
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 
   Grid_finalize();
 }
configure.ac (10 changed lines)
@@ -324,14 +324,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
-     mpi3l*)
-       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-       comms_type='mpi3l'
-     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
+     mpit)
+        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+        comms_type='mpit'
+     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'
@@ -359,7 +359,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

############### RNG selection
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
endif

-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
  extra_sources+=communicator/Communicator_base.cc
endif
@@ -89,25 +89,31 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
   GlobalSumVector((double *)c,2*N);
 }
 
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3) 
 
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
 
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,
 							 void *recv,
 							 int recv_from_rank,
-						       int bytes)
+							 int bytes, int dir)
 {
+  // Discard the "dir"
   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
+#endif
+
+#if !defined( GRID_COMMS_MPI3) 
 
 void CartesianCommunicator::StencilBarrier(void){};
 
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -64,7 +64,7 @@ class CartesianCommunicator {
   std::vector<int> _processor_coor;  // linear processor coordinate
   unsigned long _ndimension;
 
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   static MPI_Comm communicator_world;
          MPI_Comm communicator;
   typedef MPI_Request CommsRequest_t;
@@ -72,6 +72,10 @@ class CartesianCommunicator {
   typedef int CommsRequest_t;
 #endif
 
+#if defined (GRID_COMMS_MPIT)
+  std::vector<MPI_Comm> communicator_halo;
+#endif
+
   ////////////////////////////////////////////////////////////////////
   // Helper functionality for SHM Windows common to all other impls
   ////////////////////////////////////////////////////////////////////
@@ -216,9 +220,9 @@ class CartesianCommunicator {
 				    int xmit_to_rank,
 				    void *recv,
 				    int recv_from_rank,
-				  int bytes);
+				    int bytes,int dir);
   
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
   void StencilBarrier(void);
 
   ////////////////////////////////////////////////////////////
@@ -604,7 +604,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int dest,
 							 void *recv,
 							 int from,
-						       int bytes)
+							 int bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
@@ -643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
@@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
   assert(ierr==0);
 }
 
-  double CartesianCommunicator::StencilSendToRecvFromBegin(int dir,
-							   std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,
 							 void *recv,
 							 int recv_from_rank,
-							   int bytes)
+							 int bytes,int dir)
 {
   int myrank = _processor;
   int ierr;
+  assert(dir < communicator_halo.size());
+
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
   // Give the CPU to MPI immediately; can use threads to overlap optionally
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		    recv,bytes,MPI_CHAR,from, from,
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+		    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
 		    communicator_halo[dir],MPI_STATUS_IGNORE);
   assert(ierr==0);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall){ };
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{ 
+  // Do nothing
+};
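
Note the mpit semantics above: MPI_Sendrecv is blocking, so the exchange for a given dir has already completed when StencilSendToRecvFromBegin returns, and the matching StencilSendToRecvFromComplete is deliberately a no-op. Any overlap therefore has to come from issuing different dirs on different threads, each over its own communicator_halo[dir]; that concurrency also assumes the MPI library was initialised at the MPI_THREAD_MULTIPLE level. A rough caller-side sketch (the buffers, rank arrays, and loop bound are illustrative, not the benchmark's code):

    #pragma omp parallel for
    for (int dir = 0; dir < 8; dir++) {
      std::vector<CartesianCommunicator::CommsRequest_t> reqs;  // per-iteration list
      // Each dir may run on a different thread; each blocking Sendrecv uses
      // its own communicator, so the exchanges can proceed concurrently.
      Grid.StencilSendToRecvFromBegin(reqs, xbuf[dir], to_rank[dir],
                                      rbuf[dir], from_rank[dir], bytes, dir);
      Grid.StencilSendToRecvFromComplete(reqs, dir);  // no-op for mpit
    }
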
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
 
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
@@ -379,7 +379,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 
   Compressor compressor(dag);
 
@@ -388,46 +387,46 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 
   DhopFaceTime-=usecond();
   st.HaloExchangeOptGather(in,compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;
 
-  // Rely on async comms; start comms before merge of local data
-  DhopComputeTime-=usecond();
-  DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);
-
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
-
+  // Perhaps use omp task and region
 #pragma omp parallel 
   { 
-    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
+    // Should time this somehow; hard as the threads fork nowait
+    st.CommunicateThreaded();
 
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else { 
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
-    }
+  if (dag == DaggerYes) {
+#pragma omp for
+    for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+    }
+  } else {
+#pragma omp for
+    for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+    }
+  }
+#pragma omp single
+  DhopComputeTime+=usecond();
+
+#pragma omp taskwait 
+
+#pragma omp single
+  DhopCommTime+=usecond();
   } // Closes parallel region and waits the comms (I hope)
 
 
   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();
 
   // Load imbalance alert. Should use dynamic schedule OMP for loop
   // Perhaps create a list of only those sites with face work, and 
   // load balance process the list.
   DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
@@ -448,11 +447,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
   assert(0);
 #endif
 
 }
 
-
-
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
@@ -248,6 +248,39 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   //////////////////////////////////////////
+  void CommunicateThreaded()
+  {
+    for(int i=0;i<Packets.size();i++){
+#pragma omp task 
+      {
+	double start;
+	double stop;
+	start = usecond();
+	uint64_t bytes;
+	std::vector<CommsRequest_t> reqs;
+	bytes=_grid->StencilSendToRecvFromBegin(reqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,
+					  Packets[i].bytes,i);
+	_grid->StencilSendToRecvFromComplete(reqs,i);
+	// Last task logged; this is approximate but hard to catch
+	// the last to complete
+	stop = usecond();
+	stop = stop - start;
+
+	if ( i==0 ) commtime+=stop;
+
+#pragma omp critical
+	{
+	  comms_bytes+=bytes;
+	}
+
+      }
+    }
+    
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     reqs.resize(Packets.size());
@@ -258,14 +291,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 						     Packets[i].to_rank,
 						     Packets[i].recv_buf,
 						     Packets[i].from_rank,
-					  Packets[i].bytes);
+						     Packets[i].bytes,i);
     }
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      _grid->StencilSendToRecvFromComplete(reqs[i],i);
     }
     commtime+=usecond();
   }
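
CommunicateThreaded above leans on OpenMP tasking: each packet becomes a task, the encountering threads move on to compute work, and completion is collected at a task synchronisation point, either the explicit taskwait seen in WilsonFermion5D or any barrier, since OpenMP guarantees that all explicit tasks bound to a parallel region have completed at its barriers. A self-contained illustration of the pattern, with generic stand-in work functions rather than Grid's:

    #include <omp.h>
    #include <cstdio>

    void do_comms(int dir)  { /* stand-in for StencilSendToRecvFromBegin/Complete */ }
    void do_compute(int ss) { /* stand-in for an interior-site kernel */ }

    int main() {
    #pragma omp parallel
      {
    #pragma omp single nowait        // one thread spawns the comms tasks...
        for (int dir = 0; dir < 8; dir++) {
    #pragma omp task firstprivate(dir)
          do_comms(dir);
        }
    #pragma omp for                  // ...while the whole team overlaps compute
        for (int ss = 0; ss < 1024; ss++)
          do_compute(ss);
      }  // implicit barrier: all comms tasks are guaranteed complete here
      printf("done\n");
      return 0;
    }
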
@@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv)
 
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif