diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 4f16b1de..9bf7d0a5 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -218,7 +218,7 @@ public: std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -368,7 +368,7 @@ public: const int num_cases = 4; #endif controls Cases [] = { -#if defined(AVX512) +#ifdef AVX512 { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif @@ -380,6 +380,10 @@ public: for(int c=0;cBarrier(); diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 6767495f..3ce3a774 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -41,6 +41,7 @@ uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; int CartesianCommunicator::nCommThreads = -1; +int CartesianCommunicator::Hugepages = 0; ///////////////////////////////// // Alloc, free shmem region @@ -134,7 +135,10 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { } void CartesianCommunicator::ShmInitGeneric(void){ #if 1 - ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; + if ( Hugepages ) mmap_flag |= MAP_HUGETLB; + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); std::cout << "ShmCommBuf "< #ifdef HAVE_NUMAIF_H #include #endif + +// Make up for Linux deficiencies #ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 +#define 
SHM_HUGETLB 0x0 +#endif +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x0 #endif namespace Grid { @@ -213,8 +218,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); if ( fd < 0 ) { perror("failed shm_open"); assert(0); } ftruncate(fd, size); + + int mmap_flag = MAP_SHARED; + if (Hugepages) mmap_flag |= MAP_HUGETLB; + void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); @@ -628,8 +636,9 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); StencilSendToRecvFromComplete(list,dir); + return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -671,7 +680,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list); + this->StencilSendToRecvFromComplete(list,dir); } return off_node_bytes; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 0b6c9e3d..404ecce0 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -135,10 +135,11 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, template void WilsonFermion5D::Report(void) { - std::vector latt = GridDefaultLatt(); - RealD volume = Ls; for(int mu=0;mu_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + std::vector latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; @@ -390,17 +391,18 @@ void 
WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - // Rely on async comms; start comms before merge of local data double ctime=0; double ptime=0; - // DhopComputeTime-=usecond(); - // DhopCommTime-=usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// #pragma omp parallel reduction(max:ctime) reduction(max:ptime) { int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = st.Packets.size(); + if (ncomms == -1) ncomms = 1; assert(nthreads > ncomms); if (tid >= ncomms) { double start = usecond(); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index d1d7a7e0..cca67587 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -252,10 +252,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// void CommunicateThreaded() { +#ifdef GRID_OMP // must be called in parallel region int mythread = omp_get_thread_num(); int nthreads = CartesianCommunicator::nCommThreads; - if (nthreads == -1) nthreads = Packets.size(); +#else + int mythread = 0; + int nthreads = 1; +#endif + if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { for (int i = mythread; i < Packets.size(); i += nthreads) { double start = usecond(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 39a726cf..3fd8b4cd 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv) CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; } + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ + 
CartesianCommunicator::Hugepages = 1; + } + + if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } @@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv) std::cout<