From 869b99ec1efde04d94bdd02eb041a457accb930e Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 24 Jun 2017 10:55:54 +0100 Subject: [PATCH 01/22] Threaded calls to multiple communicators --- lib/communicator/Communicator_mpit.cc | 260 ++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 lib/communicator/Communicator_mpit.cc diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc new file mode 100644 index 00000000..07522900 --- /dev/null +++ b/lib/communicator/Communicator_mpit.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_mpi.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +namespace Grid { + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +MPI_Comm CartesianCommunicator::communicator_world; + +// Should error check all MPI calls. +void CartesianCommunicator::Init(int *argc, char ***argv) { + int flag; + int provided; + MPI_Initialized(&flag); // needed to coexist with other libs apparently + if ( !flag ) { + MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); + if ( provided != MPI_THREAD_MULTIPLE ) { + QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + } + } + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + ShmInitGeneric(); +} + +CartesianCommunicator::CartesianCommunicator(const std::vector &processors) +{ + _ndimension = processors.size(); + std::vector periodic(_ndimension,1); + + _Nprocessors=1; + _processors = processors; + _processor_coor.resize(_ndimension); + + MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Comm_rank(communicator,&_processor); + MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); + + for(int i=0;i<_ndimension;i++){ + _Nprocessors*=_processors[i]; + } + + communicator_halo.resize (2*_ndimension); + for(int i=0;i<_ndimension*2;i++){ + MPI_Comm_dup(communicator,&communicator_halo[i]); + } + + int Size; + MPI_Comm_size(communicator,&Size); + + assert(Size==_Nprocessors); +} +void CartesianCommunicator::GlobalSum(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(uint64_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint64_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(float &f){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSumVector(float *f,int N) +{ + int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSumVector(double *d,int N) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) +{ + int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); + assert(ierr==0); +} +int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) +{ + int rank; + int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); + assert(ierr==0); + return rank; +} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor) +{ + coor.resize(_ndimension); + int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); + assert(ierr==0); +} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFrom(void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ + std::vector reqs(0); + SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); + SendToRecvFromComplete(reqs); +} + +void CartesianCommunicator::SendRecvPacket(void *xmit, + void *recv, + int sender, + int receiver, + int bytes) +{ + MPI_Status stat; + assert(sender != receiver); + int tag = sender; + if ( _processor == sender ) { + MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); + } + if ( _processor == receiver ) { + MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); + } +} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ + int myrank = _processor; + int ierr; + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + MPI_Request xrq; + MPI_Request rrq; + + ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + + assert(ierr==0); + list.push_back(xrq); + list.push_back(rrq); + } else { + // Give the CPU to MPI immediately; can use threads to overlap optionally + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator,MPI_STATUS_IGNORE); + assert(ierr==0); + } +} +void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +{ + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + int nreq=list.size(); + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + } +} + +void CartesianCommunicator::Barrier(void) +{ + int ierr = MPI_Barrier(communicator); + assert(ierr==0); +} + +void CartesianCommunicator::Broadcast(int root,void* data, int bytes) +{ + int ierr=MPI_Bcast(data, + bytes, + MPI_BYTE, + root, + communicator); + assert(ierr==0); +} + /////////////////////////////////////////////////////// + // Should only be used prior to Grid Init finished. + // Check for this? + /////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void){ + int r; + MPI_Comm_rank(communicator_world,&r); + return r; +} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) +{ + int ierr= MPI_Bcast(data, + bytes, + MPI_BYTE, + root, + communicator_world); + assert(ierr==0); +} + + double CartesianCommunicator::StencilSendToRecvFromBegin(int dir, + std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes) +{ + int myrank = _processor; + int ierr; + // Give the CPU to MPI immediately; can use threads to overlap optionally + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator_halo[dir],MPI_STATUS_IGNORE); + assert(ierr==0); + return 2.0*bytes; +} +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall){ }; + + + +} + From 54e94360ad06cde7edbaeede2cf18eb0d5a1227b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 24 Jun 2017 23:10:24 +0100 Subject: [PATCH 02/22] Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit --- benchmarks/Benchmark_comms.cc | 27 ++++++----- configure.ac | 10 ++--- lib/Makefile.am | 4 +- lib/communicator/Communicator_base.cc | 22 +++++---- lib/communicator/Communicator_base.h | 20 +++++---- lib/communicator/Communicator_mpi3.cc | 12 ++--- lib/communicator/Communicator_mpit.cc | 26 ++++++----- lib/cshift/Cshift.h | 2 +- lib/log/Log.cc | 2 +- lib/parallelIO/BinaryIO.h | 2 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 55 +++++++++++------------ lib/stencil/Stencil.h | 45 ++++++++++++++++--- lib/util/Init.cc | 2 +- 13 files changed, 139 insertions(+), 90 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 532532f8..753b8a58 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) int Nloop=100; int nmu=0; - int maxlat=24; + int maxlat=32; for(int mu=0;mu1) nmu++; std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; @@ -80,7 +80,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -163,7 +163,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat,lat,lat,lat}); @@ -249,7 +249,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -299,7 +299,7 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); + bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -310,11 +310,11 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); + bytes,mu+4); } } - Grid.StencilSendToRecvFromComplete(requests); + Grid.StencilSendToRecvFromComplete(requests,0); Grid.Barrier(); double stop=usecond(); t_time[i] = stop-start; // microseconds @@ -346,7 +346,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -393,8 +393,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu); + Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); comm_proc = mpi_layout[mu]-1; @@ -406,8 +406,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu+4); + Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); } @@ -435,6 +435,9 @@ int main (int argc, char ** argv) } } + std::cout< &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes) + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) { + // Discard the "dir" SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); return 2.0*bytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } +#endif + +#if !defined( GRID_COMMS_MPI3) + void CartesianCommunicator::StencilBarrier(void){}; commVector CartesianCommunicator::ShmBufStorageVector; diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 12a8429f..4e471b43 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -38,7 +38,7 @@ Author: Peter Boyle #ifdef GRID_COMMS_MPI3 #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif #ifdef GRID_COMMS_SHMEM @@ -64,7 +64,7 @@ class CartesianCommunicator { std::vector _processor_coor; // linear processor coordinate unsigned long _ndimension; -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) static MPI_Comm communicator_world; MPI_Comm communicator; typedef MPI_Request CommsRequest_t; @@ -72,6 +72,10 @@ class CartesianCommunicator { typedef int CommsRequest_t; #endif +#if defined (GRID_COMMS_MPIT) + std::vector communicator_halo; +#endif + //////////////////////////////////////////////////////////////////// // Helper functionality for SHM Windows common to all other impls //////////////////////////////////////////////////////////////////// @@ -212,13 +216,13 @@ class CartesianCommunicator { void SendToRecvFromComplete(std::vector &waitall); double StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); - void StencilSendToRecvFromComplete(std::vector &waitall); + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); //////////////////////////////////////////////////////////// diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 632eb991..8046fef6 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -600,11 +600,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) { MPI_Request xrq; MPI_Request rrq; @@ -643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 07522900..24a518ec 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) assert(ierr==0); } - double CartesianCommunicator::StencilSendToRecvFromBegin(int dir, - std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes) +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir) { + int myrank = _processor; int ierr; + assert(dir < communicator_halo.size()); + + // std::cout << " sending on communicator "< &waitall){ }; +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) +{ + // Do nothing +}; diff --git a/lib/cshift/Cshift.h b/lib/cshift/Cshift.h index cd162e35..7d0caeee 100644 --- a/lib/cshift/Cshift.h +++ b/lib/cshift/Cshift.h @@ -42,7 +42,7 @@ Author: Peter Boyle #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif diff --git a/lib/log/Log.cc b/lib/log/Log.cc index 69a9a0a8..65dc2812 100644 --- a/lib/log/Log.cc +++ b/lib/log/Log.cc @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 117bec01..480afa01 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -29,7 +29,7 @@ #ifndef GRID_BINARY_IO_H #define GRID_BINARY_IO_H -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #define USE_MPI_IO #else #undef USE_MPI_IO diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 27319fb0..6a6bc1f8 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -379,7 +379,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg { #ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; Compressor compressor(dag); @@ -388,46 +387,46 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - std::vector > reqs; // Rely on async comms; start comms before merge of local data + DhopComputeTime-=usecond(); DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); - - DhopFaceTime-=usecond(); - st.CommsMergeSHM(compressor); - DhopFaceTime+=usecond(); - - // Perhaps use omp task and region #pragma omp parallel { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; + // Should time this somehow; hard as the threads fork nowait + st.CommunicateThreaded(); - GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); - int sF = LLs * myoff; - - if ( me == 0 ) { - st.CommunicateComplete(reqs); - DhopCommTime+=usecond(); - } else { - // Interior links in stencil - if ( me==1 ) DhopComputeTime-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - if ( me==1 ) DhopComputeTime+=usecond(); + if (dag == DaggerYes) { +#pragma omp for + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } else { +#pragma omp for + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); } } +#pragma omp single + DhopComputeTime+=usecond(); + +#pragma omp taskwait + +#pragma omp single + DhopCommTime+=usecond(); + } // Closes parallel region and waits the comms (I hope) + DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - // Load imbalance alert. Should use dynamic schedule OMP for loop - // Perhaps create a list of only those sites with face work, and - // load balance process the list. DhopComputeTime2-=usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); @@ -448,11 +447,9 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg #else assert(0); #endif - } - template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 2894778a..17db64d8 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -248,24 +248,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// // Comms packet queue for asynch thread ////////////////////////////////////////// + void CommunicateThreaded() + { + for(int i=0;i reqs; + bytes=_grid->StencilSendToRecvFromBegin(reqs, + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + _grid->StencilSendToRecvFromComplete(reqs,i); + // Last task logged; this is approximate but hard to catch + // the last to complete + stop = usecond(); + stop = stop - start; + + if ( i==0 ) commtime+=stop; + +#pragma omp critical + { + comms_bytes+=bytes; + } + + } + } + + } void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); } } void CommunicateComplete(std::vector > &reqs) { for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + _grid->StencilSendToRecvFromComplete(reqs[i],i); } commtime+=usecond(); } diff --git a/lib/util/Init.cc b/lib/util/Init.cc index fe3b1734..fc701ac1 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv) void Grid_finalize(void) { -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) MPI_Finalize(); Grid_unquiesce_nodes(); #endif From 6f5a5cd9b3269932a720804aebe8b7046d4b68fe Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 28 Jun 2017 23:27:02 +0100 Subject: [PATCH 03/22] Improved threaded comms benchmark --- TODO | 11 ++-- benchmarks/Benchmark_comms.cc | 94 +++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/TODO b/TODO index 001c6c0c..3d29215e 100644 --- a/TODO +++ b/TODO @@ -2,10 +2,13 @@ TODO: --------------- Large item work list: -1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O +1)- I/O; There appear to be issues with MPI IO and NERSC with large files. + Possible 2GB limit reappeared. GPFS driver in Intel MPI. + +2)- BG/Q port and check + +3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features -2)- Christoph's local basis expansion Lanczos -3)- BG/Q port and check 4)- Precision conversion and sort out localConvert <-- partial - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet 5)- Physical propagator interface @@ -14,6 +17,8 @@ Large item work list: 8)- HDCR resume Recent DONE + +-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE -- GaugeFix into central location <-- DONE -- Scidac and Ildg metadata handling <-- DONE diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 753b8a58..698f9d25 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -435,6 +435,100 @@ int main (int argc, char ** argv) } } + + + + std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + double dbytes; + for(int i=0;i requests; + dbytes=0; + ncomm=0; + + parallel_for(int dir=0;dir<8;dir++){ + + double tbytes; + int mu =dir % 4; + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int xmit_to_rank; + int recv_from_rank; + if ( dir == mu ) { + int comm_proc=1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } else { + int comm_proc = mpi_layout[mu]-1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } + tbytes= Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[dir][0], + xmit_to_rank, + (void *)&rbuf[dir][0], + recv_from_rank, + bytes,dir); + Grid.StencilSendToRecvFromComplete(requests,dir); + requests.resize(0); + +#pragma omp atomic + dbytes+=tbytes; + } + } + Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } + + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; + + + std::cout< Date: Sat, 29 Jul 2017 13:06:53 -0400 Subject: [PATCH 04/22] Threaded MPI calls patches --- benchmarks/Benchmark_dwf.cc | 2 +- lib/allocator/AlignedAllocator.h | 10 ++- lib/communicator/Communicator_base.cc | 4 +- lib/communicator/Communicator_base.h | 14 ++++- lib/communicator/Communicator_mpit.cc | 25 +++++++- lib/qcd/action/fermion/WilsonFermion5D.cc | 74 ++++++++++++++--------- lib/stencil/Stencil.h | 59 +++++++++--------- lib/util/Init.cc | 6 +- 8 files changed, 128 insertions(+), 66 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a071c050..0264905c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -489,7 +489,7 @@ int main (int argc, char ** argv) //assert(norm2(src_e)<1.0e-4); //assert(norm2(src_o)<1.0e-4); - + exit(0); Grid_finalize(); } diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 6e85ab27..7fd9496f 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -92,11 +92,15 @@ public: size_type bytes = __n*sizeof(_Tp); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); - + ////////////////// + // Hack 2MB align; could make option probably doesn't need configurability + ////////////////// +//define GRID_ALLOC_ALIGN (128) +#define GRID_ALLOC_ALIGN (2*1024*1024) #ifdef HAVE_MM_MALLOC_H - if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); #else - if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); #endif return ptr; diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index a5edf8e9..67bfaed0 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -34,7 +34,9 @@ namespace Grid { /////////////////////////////////////////////////////////////// void * CartesianCommunicator::ShmCommBuf; uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; -CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +CartesianCommunicator::CommunicatorPolicy_t +CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +int CartesianCommunicator::nCommThreads = -1; ///////////////////////////////// // Alloc, free shmem region diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 4e471b43..84dbedb4 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -54,8 +54,9 @@ class CartesianCommunicator { // 128MB shared memory for comms enought for 48^4 local vol comms // Give external control (command line override?) of this - static const int MAXLOG2RANKSPERNODE = 16; - static uint64_t MAX_MPI_SHM_BYTES; + static const int MAXLOG2RANKSPERNODE = 16; + static uint64_t MAX_MPI_SHM_BYTES; + static int nCommThreads; // Communicator should know nothing of the physics grid, only processor grid. int _Nprocessors; // How many in all @@ -125,7 +126,7 @@ class CartesianCommunicator { enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; static CommunicatorPolicy_t CommunicatorPolicy; static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } - + size_t heap_top; size_t heap_bytes; @@ -215,6 +216,12 @@ class CartesianCommunicator { void SendToRecvFromComplete(std::vector &waitall); + double StencilSendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); + double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank, @@ -222,6 +229,7 @@ class CartesianCommunicator { int recv_from_rank, int bytes,int dir); + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 24a518ec..f522701c 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 6a6bc1f8..0b6c9e3d 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -391,37 +391,57 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime+=usecond(); // Rely on async comms; start comms before merge of local data - DhopComputeTime-=usecond(); - DhopCommTime-=usecond(); -#pragma omp parallel + double ctime=0; + double ptime=0; + // DhopComputeTime-=usecond(); + // DhopCommTime-=usecond(); +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) { - // Should time this somehow; hard as the threads fork nowait - st.CommunicateThreaded(); - - if (dag == DaggerYes) { -#pragma omp for - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = st.Packets.size(); + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } + ptime = usecond() - start; } - } else { -#pragma omp for - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; } } -#pragma omp single - DhopComputeTime+=usecond(); - -#pragma omp taskwait - -#pragma omp single - DhopCommTime+=usecond(); - } // Closes parallel region and waits the comms (I hope) - + DhopCommTime += ctime; + DhopComputeTime+=ptime; DhopFaceTime-=usecond(); st.CommsMerge(compressor); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 17db64d8..d1d7a7e0 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; + std::vector comms_bytesthr; + std::vector commtimethr; //////////////////////////////////////// // Stencil query @@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// void CommunicateThreaded() { - for(int i=0;i reqs; - bytes=_grid->StencilSendToRecvFromBegin(reqs, - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - _grid->StencilSendToRecvFromComplete(reqs,i); - // Last task logged; this is approximate but hard to catch - // the last to complete - stop = usecond(); - stop = stop - start; - - if ( i==0 ) commtime+=stop; - -#pragma omp critical - { - comms_bytes+=bytes; - } - + // must be called in parallel region + int mythread = omp_get_thread_num(); + int nthreads = CartesianCommunicator::nCommThreads; + if (nthreads == -1) nthreads = Packets.size(); + if (mythread < nthreads) { + for (int i = mythread; i < Packets.size(); i += nthreads) { + double start = usecond(); + uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comms_bytesthr[mythread] += bytes; + commtimethr[mythread] += usecond() - start; } } - } void CommunicateBegin(std::vector > &reqs) { @@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int checkerboard, const std::vector &directions, const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) + : _permute_type(npoints), + _comm_buf_size(npoints), + comms_bytesthr(npoints), + commtimethr(npoints) { face_table_computed=0; _npoints = npoints; @@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal void ZeroCounters(void) { gathertime = 0.; commtime = 0.; + memset(&commtimethr[0], 0, sizeof(commtimethr)); + memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr)); halogtime = 0.; mergetime = 0.; decompresstime = 0.; @@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); + double t = 0; + // if commtimethr is set they were all done in parallel so take the max + // but add up the bytes + for (int i = 0; i < 8; ++i) { + comms_bytes += comms_bytesthr[i]; + if (t < commtimethr[i]) t = commtimethr[i]; + } + commtime += t; _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { diff --git a/lib/util/Init.cc b/lib/util/Init.cc index fc701ac1..ef875429 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ LebesgueOrder::UseLebesgueOrder=1; } - + CartesianCommunicator::nCommThreads = -1; + if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){ + arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads"); + GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); + } if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); From bcefdd7c4eff147242ededf040653449c2d573c9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:49:02 -0400 Subject: [PATCH 05/22] Align both allocator calls to 2MB --- lib/allocator/AlignedAllocator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 7fd9496f..39734b53 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -186,9 +186,9 @@ public: pointer allocate(size_type __n, const void* _p= 0) { #ifdef HAVE_MM_MALLOC_H - _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); + _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN); #else - _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); + _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp)); #endif return ptr; } From 9e658de2383620b5aa002f319b85442ab24d8115 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:52:44 -0400 Subject: [PATCH 06/22] Use Vector --- benchmarks/Benchmark_comms.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 698f9d25..491fba1e 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -92,8 +92,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + Vector > xbuf(8,Vector(lat*lat*lat*Ls)); + Vector > rbuf(8,Vector(lat*lat*lat*Ls)); int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); @@ -172,8 +172,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + Vector > xbuf(8,Vector(lat*lat*lat*Ls)); + Vector > rbuf(8,Vector(lat*lat*lat*Ls)); int ncomm; From d6472eda8d00c8d0ffc60760a4dd9462702ac00b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:53:18 -0400 Subject: [PATCH 07/22] Use mmap --- lib/communicator/Communicator_base.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 67bfaed0..6767495f 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -26,6 +26,10 @@ Author: Peter Boyle *************************************************************************************/ /* END LEGAL */ #include +#include +#include +#include +#include namespace Grid { @@ -129,8 +133,15 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ +#if 1 + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); + std::cout << "ShmCommBuf "< Date: Sat, 19 Aug 2017 12:53:59 -0400 Subject: [PATCH 08/22] Enable blocking stencil send --- lib/communicator/Communicator_mpit.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index f522701c..c0fb47fd 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,20 +242,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sat, 19 Aug 2017 13:18:50 -0400 Subject: [PATCH 09/22] Fix mpi 3 interface change --- lib/communicator/Communicator_mpi3.cc | 11 +++++++++++ lib/communicator/Communicator_mpit.cc | 25 +++++++------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 46e4745c..e6e33d33 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -621,6 +621,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis } } +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) +{ + std::vector list; + StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + StencilSendToRecvFromComplete(list,dir); +} + double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int dest, diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index c0fb47fd..9a9b26d2 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,17 +242,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) +{ + // Do nothing +}; double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, @@ -266,17 +261,11 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, // std::cout << " sending on communicator "< &waitall,int dir) -{ - // Do nothing -}; From a446d95c3393d697f987434ac594950d18017b7a Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 01:10:50 +0100 Subject: [PATCH 10/22] Trying to pass TeamCity and Travis --- benchmarks/Benchmark_ITT.cc | 12 ++++++------ lib/communicator/Communicator_base.cc | 6 +++++- lib/communicator/Communicator_base.h | 19 +++++++++++++------ lib/communicator/Communicator_mpi3.cc | 17 +++++++++++++---- lib/qcd/action/fermion/WilsonFermion5D.cc | 18 ++++++++++-------- lib/stencil/Stencil.h | 7 ++++++- lib/util/Init.cc | 18 ++++++++++++++---- 7 files changed, 67 insertions(+), 30 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 4f16b1de..9bf7d0a5 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -218,7 +218,7 @@ public: std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -368,7 +368,7 @@ public: const int num_cases = 4; #endif controls Cases [] = { -#if defined(AVX512) +#ifdef AVX512 { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif @@ -380,6 +380,10 @@ public: for(int c=0;cBarrier(); diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 6767495f..3ce3a774 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -41,6 +41,7 @@ uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; int CartesianCommunicator::nCommThreads = -1; +int CartesianCommunicator::Hugepages = 0; ///////////////////////////////// // Alloc, free shmem region @@ -134,7 +135,10 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { } void CartesianCommunicator::ShmInitGeneric(void){ #if 1 - ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; + if ( Hugepages ) mmap_flag |= MAP_HUGETLB; + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); std::cout << "ShmCommBuf "< #ifdef HAVE_NUMAIF_H #include #endif + +// Make up for linex deficiencies #ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 +#define SHM_HUGETLB 0x0 +#endif +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x0 #endif namespace Grid { @@ -213,8 +218,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); if ( fd < 0 ) { perror("failed shm_open"); assert(0); } ftruncate(fd, size); + + int mmap_flag = MAP_SHARED; + if (Hugepages) mmap_flag |= MAP_HUGETLB; + void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); @@ -628,8 +636,9 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); StencilSendToRecvFromComplete(list,dir); + return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -671,7 +680,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list); + this->StencilSendToRecvFromComplete(list,dir); } return off_node_bytes; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 0b6c9e3d..404ecce0 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -135,10 +135,11 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, template void WilsonFermion5D::Report(void) { - std::vector latt = GridDefaultLatt(); - RealD volume = Ls; for(int mu=0;mu_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + std::vector latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; @@ -390,17 +391,18 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - // Rely on async comms; start comms before merge of local data double ctime=0; double ptime=0; - // DhopComputeTime-=usecond(); - // DhopCommTime-=usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// #pragma omp parallel reduction(max:ctime) reduction(max:ptime) { int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = st.Packets.size(); + if (ncomms == -1) ncomms = 1; assert(nthreads > ncomms); if (tid >= ncomms) { double start = usecond(); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index d1d7a7e0..cca67587 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -252,10 +252,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// void CommunicateThreaded() { +#ifdef GRID_OMP // must be called in parallel region int mythread = omp_get_thread_num(); int nthreads = CartesianCommunicator::nCommThreads; - if (nthreads == -1) nthreads = Packets.size(); +#else + int mythread = 0; + int nthreads = 1; +#endif + if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { for (int i = mythread; i < Packets.size(); i += nthreads) { double start = usecond(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 39a726cf..3fd8b4cd 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv) CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; } + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ + CartesianCommunicator::Hugepages = 1; + } + + if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } @@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv) std::cout< Date: Sun, 20 Aug 2017 01:37:07 +0100 Subject: [PATCH 11/22] Comms none fail fix --- lib/communicator/Communicator_base.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 3ce3a774..2e6626be 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -102,6 +102,18 @@ int CartesianCommunicator::NodeCount(void) { return Proc int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; #endif #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) +{ + std::vector list; + // Discard the "dir" + SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); + SendToRecvFromComplete(list); + return 2.0*bytes; +} double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank, From 1cdf99966810227f180452393973c87ae4a301c4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 02:39:10 +0100 Subject: [PATCH 12/22] Moving multicommunicator into mpi3 also for threading --- lib/communicator/Communicator_base.h | 8 ++++---- lib/communicator/Communicator_mpi3.cc | 12 ++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index ac7d94f3..ac866ced 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -78,15 +78,15 @@ class CartesianCommunicator { #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) static MPI_Comm communicator_world; - MPI_Comm communicator; + + MPI_Comm communicator; + std::vector communicator_halo; + typedef MPI_Request CommsRequest_t; #else typedef int CommsRequest_t; #endif -#if defined (GRID_COMMS_MPIT) - std::vector communicator_halo; -#endif //////////////////////////////////////////////////////////////////// // Helper functionality for SHM Windows common to all other impls diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 4f769971..9e5dfb97 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -405,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { int ierr; communicator=communicator_world; + _ndimension = processors.size(); + communicator_halo.resize (2*_ndimension); + for(int i=0;i<_ndimension*2;i++){ + MPI_Comm_dup(communicator,&communicator_halo[i]); + } + //////////////////////////////////////////////////////////////// // Assert power of two shm_size. //////////////////////////////////////////////////////////////// @@ -648,6 +654,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sun, 20 Aug 2017 02:53:12 +0100 Subject: [PATCH 13/22] finalise issue on new OPA revert --- benchmarks/Benchmark_dwf.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 98ce0a07..3858226e 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -503,9 +503,9 @@ int main (int argc, char ** argv) std::cout< Date: Sun, 20 Aug 2017 03:08:54 +0100 Subject: [PATCH 14/22] MAP_HUGETLB portability fix --- lib/communicator/Communicator_base.cc | 2 ++ lib/communicator/Communicator_mpi3.cc | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 2e6626be..3378c56a 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -149,7 +149,9 @@ void CartesianCommunicator::ShmInitGeneric(void){ #if 1 int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; +#ifdef MAP_HUGETLB if ( Hugepages ) mmap_flag |= MAP_HUGETLB; +#endif ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); std::cout << "ShmCommBuf "< #include #endif -// Make up for linex deficiencies -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 0x0 -#endif -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x0 -#endif namespace Grid { @@ -220,7 +213,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ftruncate(fd, size); int mmap_flag = MAP_SHARED; +#ifdef MAP_HUGETLB if (Hugepages) mmap_flag |= MAP_HUGETLB; +#endif void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } @@ -274,7 +269,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { for(int r=0;r Date: Fri, 25 Aug 2017 09:25:54 +0100 Subject: [PATCH 15/22] Benchmark prep --- benchmarks/Benchmark_ITT.cc | 322 +++++++++++++++++++--- benchmarks/Benchmark_comms.cc | 30 +- lib/allocator/AlignedAllocator.h | 5 + lib/communicator/Communicator_base.cc | 6 +- lib/communicator/Communicator_mpi3.cc | 5 +- lib/communicator/Communicator_mpit.cc | 19 +- lib/qcd/action/fermion/CayleyFermion5D.cc | 12 +- lib/qcd/action/fermion/WilsonCompressor.h | 41 ++- lib/qcd/action/fermion/WilsonFermion5D.cc | 11 + lib/stencil/Stencil.h | 114 +++++++- 10 files changed, 494 insertions(+), 71 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 9bf7d0a5..c5226ee1 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -32,6 +32,19 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; +typedef WilsonFermion5D WilsonFermion5DR; +typedef WilsonFermion5D WilsonFermion5DF; +typedef WilsonFermion5D WilsonFermion5DD; + + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; struct time_statistics{ double mean; @@ -95,13 +108,15 @@ public: static void Comms(void) { - int Nloop=100; + int Nloop=1000; int nmu=0; int maxlat=32; std::vector simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); + for(int mu=0;mu1) nmu++; + std::vector t_time(Nloop); time_statistics timestat; @@ -133,13 +148,14 @@ public: bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } - int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + int ncomm; double dbytes; + std::vector times(Nloop); for(int i=0;i requests; dbytes=0; ncomm=0; @@ -150,7 +166,6 @@ public: if (mpi_layout[mu]>1 ) { - ncomm++; int xmit_to_rank; int recv_from_rank; if ( dir == mu ) { @@ -160,18 +175,18 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } -#if 1 - tbytes= Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[dir][0], - xmit_to_rank, - (void *)&rbuf[dir][0], - recv_from_rank, - bytes,dir); - Grid.StencilSendToRecvFromComplete(requests,dir); -#endif - requests.resize(0); - + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes,dir); + +#ifdef GRID_OMP #pragma omp atomic +#endif + ncomm++; + +#ifdef GRID_OMP +#pragma omp atomic +#endif dbytes+=tbytes; } } @@ -181,13 +196,15 @@ public: } timestat.statistics(t_time); + // for(int i=0;i({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -253,8 +271,7 @@ public: } }; - - static void DWF(int Ls,int L) + static double DWF5(int Ls,int L) { RealD mass=0.1; RealD M5 =1.8; @@ -262,6 +279,7 @@ public: double mflops; double mflops_best = 0; double mflops_worst= 0; + std::vector mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size @@ -274,6 +292,189 @@ public: GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + std::vector internal; + if ( SHM == 1 ) internal = std::vector({1,1,1,1}); + else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); + else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); + else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); + else assert(0); + + std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); + std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + ///////// Source preparation //////////// + LatticeFermion src (sFGrid); random(RNG5,src); + LatticeFermion tmp (sFGrid); + + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + + WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); + LatticeFermion src_e (sFrbGrid); + LatticeFermion src_o (sFrbGrid); + LatticeFermion r_e (sFrbGrid); + LatticeFermion r_o (sFrbGrid); + LatticeFermion r_eo (sFGrid); + LatticeFermion err (sFGrid); + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + +#if defined(AVX512) + const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); +#else + const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); +#endif + controls Cases [] = { +#ifdef AVX512 + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, +#endif + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 1000; + + sFGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4); + std::vector local({L,L,L,L}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), + GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; uint64_t SHM=NP/NN; std::vector internal; @@ -364,13 +565,15 @@ public: #if defined(AVX512) const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); #else const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); #endif controls Cases [] = { #ifdef AVX512 - { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, @@ -394,7 +597,7 @@ public: if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<Barrier(); for(int i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 1000; + FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<mflops_best ) mflops_best = mflops; @@ -450,12 +656,20 @@ public: } std::cout< L_list({8,12,16,24}); + std::vector wilson; + std::vector dwf4; + std::vector dwf5; + if ( do_wilson ) { int Ls=1; std::cout< > xbuf(8,Vector(lat*lat*lat*Ls)); - Vector > rbuf(8,Vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < > xbuf(8,Vector(lat*lat*lat*Ls)); - Vector > rbuf(8,Vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < &waitall,int dir) { - // Do nothing + int nreq=waitall.size(); + MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); }; double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, int xmit_to_rank, @@ -262,7 +275,7 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, // Give the CPU to MPI immediately; can use threads to overlap optionally MPI_Request req[2]; MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); - MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank, communicator_halo[dir], &req[0]); + MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]); MPI_Waitall(2, req, MPI_STATUSES_IGNORE); return 2.0*bytes; } diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 46ba3793..5e67d1f1 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -429,7 +429,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorM5) +1.0); - // assert(fabs(bee[i])>0.0); + assert(fabs(bee[i])>0.0); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); beo[i]=as[i]*bs[i]; ceo[i]=-as[i]*cs[i]; @@ -455,11 +455,17 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(fabs(bee[0])>0.0); lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column leem[i]=mass*cee[Ls-1]/bee[0]; - for(int j=0;j0.0); + leem[i]*= aee[j]/bee[j+1]; + } uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row @@ -478,7 +484,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(fabs(bee[j])>0.0); delta_d *= cee[j]/bee[j]; } dee[Ls-1] += delta_d; diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 96cbe1ec..30c6d838 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -238,7 +238,35 @@ template using WilsonCompressor = WilsonCom template class WilsonStencil : public CartesianStencil { public: - + double timer0; + double timer1; + double timer2; + double timer3; + double timer4; + double timer5; + double timer6; + uint64_t callsi; + void ZeroCountersi(void) + { + std::cout << GridLogMessage << " ZeroCountersi()"< same_node; @@ -252,6 +280,7 @@ public: : CartesianStencil (grid,npoints,checkerboard,directions,distances) , same_node(npoints) { + ZeroCountersi(); surface_list.resize(0); }; @@ -282,17 +311,25 @@ public: { std::vector > reqs; this->HaloExchangeOptGather(source,compress); + double t1=usecond(); this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); + double t2=usecond(); timer1 += t2-t1; this->CommsMerge(compress); + double t3=usecond(); timer2 += t3-t2; this->CommsMergeSHM(compress); + double t4=usecond(); timer3 += t4-t3; } template void HaloExchangeOptGather(const Lattice &source,compressor &compress) { this->Prepare(); + double t0=usecond(); this->HaloGatherOpt(source,compress); + double t1=usecond(); + timer0 += t1-t0; + callsi++; } template @@ -304,7 +341,9 @@ public: typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + this->mpi3synctime_g-=usecond(); this->_grid->StencilBarrier(); + this->mpi3synctime_g+=usecond(); assert(source._grid==this->_grid); this->halogtime-=usecond(); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 404ecce0..c5b0f872 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -185,6 +185,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "WilsonFermion5D StencilEven"< 0){ + std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" < @@ -204,6 +209,9 @@ void WilsonFermion5D::ZeroCounters(void) { Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); } @@ -445,6 +453,9 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopCommTime += ctime; DhopComputeTime+=ptime; + // First to enter, last to leave timing + st.CollateThreads(); + DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index cca67587..ad454bcb 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal // Timing info; ugly; possibly temporary ///////////////////////////////////////// double commtime; + double mpi3synctime; + double mpi3synctime_g; + double shmmergetime; double gathertime; double gathermtime; double halogtime; @@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; - std::vector comms_bytesthr; - std::vector commtimethr; + std::vector comm_bytes_thr; + std::vector comm_time_thr; + std::vector comm_enter_thr; + std::vector comm_leave_thr; //////////////////////////////////////// // Stencil query @@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal #endif if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { + comm_enter_thr[mythread] = usecond(); for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes,i); - comms_bytesthr[mythread] += bytes; - commtimethr[mythread] += usecond() - start; + comm_bytes_thr[mythread] += bytes; } + comm_leave_thr[mythread]= usecond(); + comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } + + void CollateThreads(void) + { + int nthreads = CartesianCommunicator::nCommThreads; + double first=0.0; + double last =0.0; + + for(int t=0;t 0.0) && ( t0 < first ) ) first = t0; // min time seen + + if ( t1 > last ) last = t1; // max time seen + + } + commtime+= last-first; + } void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); @@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } commtime+=usecond(); } + void Communicate(void) + { +#ifdef GRID_OMP +#pragma omp parallel + { + // must be called in parallel region + int mythread = omp_get_thread_num(); + int maxthreads= omp_get_max_threads(); + int nthreads = CartesianCommunicator::nCommThreads; + assert(nthreads <= maxthreads); + + if (nthreads == -1) nthreads = 1; +#else + int mythread = 0; + int nthreads = 1; +#endif + if (mythread < nthreads) { + for (int i = mythread; i < Packets.size(); i += nthreads) { + double start = usecond(); + comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comm_time_thr[mythread] += usecond() - start; + } + } +#ifdef GRID_OMP + } +#endif + } template void HaloExchange(const Lattice &source,compressor &compress) { std::vector > reqs; Prepare(); HaloGather(source,compress); + // Concurrent CommunicateBegin(reqs); CommunicateComplete(reqs); + // Sequential + // Communicate(); CommsMergeSHM(compress); CommsMerge(compress); } @@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal template void HaloGather(const Lattice &source,compressor &compress) { + mpi3synctime_g-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime_g+=usecond(); // conformable(source._grid,_grid); assert(source._grid==_grid); @@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); + shmmergetime+=usecond(); } template @@ -470,8 +542,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal const std::vector &distances) : _permute_type(npoints), _comm_buf_size(npoints), - comms_bytesthr(npoints), - commtimethr(npoints) + comm_bytes_thr(npoints), + comm_enter_thr(npoints), + comm_leave_thr(npoints), + comm_time_thr(npoints) { face_table_computed=0; _npoints = npoints; @@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal void ZeroCounters(void) { gathertime = 0.; commtime = 0.; - memset(&commtimethr[0], 0, sizeof(commtimethr)); - memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr)); + mpi3synctime=0.; + mpi3synctime_g=0.; + shmmergetime=0.; + for(int i=0;i<_npoints;i++){ + comm_time_thr[i]=0; + comm_bytes_thr[i]=0; + comm_enter_thr[i]=0; + comm_leave_thr[i]=0; + } halogtime = 0.; mergetime = 0.; decompresstime = 0.; @@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal RealD NP = _grid->_Nprocessors; RealD NN = _grid->NodeCount(); double t = 0; - // if commtimethr is set they were all done in parallel so take the max + // if comm_time_thr is set they were all done in parallel so take the max // but add up the bytes + int threaded = 0 ; for (int i = 0; i < 8; ++i) { - comms_bytes += comms_bytesthr[i]; - if (t < commtimethr[i]) t = commtimethr[i]; + if ( comm_time_thr[i]>0.0 ) { + threaded = 1; + comms_bytes += comm_bytes_thr[i]; + if (t < comm_time_thr[i]) t = comm_time_thr[i]; + } } - commtime += t; + if (threaded) commtime += t; _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { @@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< Date: Fri, 25 Aug 2017 11:41:01 +0100 Subject: [PATCH 16/22] updated from cambridge mpi3 shakeout --- benchmarks/Benchmark_ITT.cc | 4 ++-- lib/qcd/action/fermion/WilsonCompressor.h | 7 +++++-- lib/stencil/Stencil.h | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index c5226ee1..bd75dd8e 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -108,7 +108,7 @@ public: static void Comms(void) { - int Nloop=1000; + int Nloop=200; int nmu=0; int maxlat=32; @@ -197,7 +197,7 @@ public: timestat.statistics(t_time); // for(int i=0;i > reqs; this->HaloExchangeOptGather(source,compress); double t1=usecond(); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); + // Asynchronous MPI calls multidirectional, Isend etc... + // this->CommunicateBegin(reqs); + // this->CommunicateComplete(reqs); + // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. + this->Communicate(); double t2=usecond(); timer1 += t2-t1; this->CommsMerge(compress); double t3=usecond(); timer2 += t3-t2; diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index ad454bcb..cd0792d5 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -365,10 +365,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Prepare(); HaloGather(source,compress); // Concurrent - CommunicateBegin(reqs); - CommunicateComplete(reqs); - // Sequential - // Communicate(); + //CommunicateBegin(reqs); + //CommunicateComplete(reqs); + // Sequential, possibly threaded + Communicate(); CommsMergeSHM(compress); CommsMerge(compress); } From 3a582174053732f4e5645367b750fd446d8fcb1d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Aug 2017 14:29:53 +0100 Subject: [PATCH 17/22] Updated --- benchmarks/Benchmark_ITT.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index bd75dd8e..2edae8d0 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -386,7 +386,7 @@ public: if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<Barrier(); for(int i=0;iBroadcast(0,&ncall,sizeof(ncall)); From d0f3d525d5dfb6cd7a2f5fe3be5a69c7ddc1306e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Aug 2017 19:33:54 +0100 Subject: [PATCH 18/22] Optimal block size for KNL --- benchmarks/Benchmark_ITT.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2edae8d0..c0ce451f 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -679,8 +679,11 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); +#ifdef KNL + LebesgueOrder::Block = std::vector({8,2,2,2}); +#else LebesgueOrder::Block = std::vector({2,2,2,2}); - +#endif Benchmark::Decomposition(); int do_memory=1; From f68b5de9c8798779ef2657b9c2d469174ae8f53a Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 25 Aug 2017 19:35:21 +0100 Subject: [PATCH 19/22] No compile fix on Clang --- lib/qcd/action/fermion/CayleyFermion5D.cc | 12 ++++++------ lib/qcd/action/fermion/WilsonCompressor.h | 4 ---- lib/qcd/action/fermion/WilsonFermion5D.cc | 5 +++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 5e67d1f1..838b1c3d 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -414,7 +414,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(omega[i]!=Coeff_t(0.0)); bs[i] = 0.5*(bpc/omega[i] + bmc); cs[i] = 0.5*(bpc/omega[i] - bmc); } @@ -429,7 +429,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorM5) +1.0); - assert(fabs(bee[i])>0.0); + assert(bee[i]!=Coeff_t(0.0)); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); beo[i]=as[i]*bs[i]; ceo[i]=-as[i]*cs[i]; @@ -456,14 +456,14 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); - assert(fabs(bee[0])>0.0); + assert(bee[i]!=Coeff_t(0.0)); + assert(bee[0]!=Coeff_t(0.0)); lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column leem[i]=mass*cee[Ls-1]/bee[0]; for(int j=0;j0.0); + assert(bee[j+1]!=Coeff_t(0.0)); leem[i]*= aee[j]/bee[j+1]; } @@ -484,7 +484,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(bee[j] != Coeff_t(0.0)); delta_d *= cee[j]/bee[j]; } dee[Ls-1] += delta_d; diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 406476b0..cc5c3c63 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -248,7 +248,6 @@ public: uint64_t callsi; void ZeroCountersi(void) { - std::cout << GridLogMessage << " ZeroCountersi()"<_npoints;point++){ same_node[point] = this->SameNode(point); - // std::cout << " dir " <HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index c5b0f872..1da58ddb 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -123,12 +123,13 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, int vol4; vol4=FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); + vol4=FourDimRedBlackGrid.oSites(); StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); - std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() - <<" " << StencilEven.surface_list.size()< Date: Fri, 25 Aug 2017 20:43:37 +0100 Subject: [PATCH 20/22] Fix --- benchmarks/Benchmark_ITT.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 58fdb84a..c0ce451f 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -181,6 +181,7 @@ public: #ifdef GRID_OMP #pragma omp atomic +#endif ncomm++; #ifdef GRID_OMP From 54a5e6c1d0ec1cf1b66dac5ba407db49bc7e1016 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 25 Aug 2017 22:36:08 +0100 Subject: [PATCH 21/22] Check if we get huge pages on linux. Larry Meadows piece of magic. --- lib/allocator/AlignedAllocator.cc | 33 +++++++++++++++++++++++++++++++ lib/allocator/AlignedAllocator.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc index 04de20bf..764bd732 100644 --- a/lib/allocator/AlignedAllocator.cc +++ b/lib/allocator/AlignedAllocator.cc @@ -63,4 +63,37 @@ void *PointerCache::Lookup(size_t bytes) { return NULL; } + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + } diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index c5ad0883..e64a5949 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -64,6 +64,8 @@ namespace Grid { }; + void check_huge_pages(void *Buf,uint64_t BYTES); + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// From 4b4c2a715b319bcc7060ef9ae8aa983c49471167 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 26 Aug 2017 11:38:04 +0100 Subject: [PATCH 22/22] fcntl.h needed --- lib/allocator/AlignedAllocator.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc index 764bd732..967b2571 100644 --- a/lib/allocator/AlignedAllocator.cc +++ b/lib/allocator/AlignedAllocator.cc @@ -1,7 +1,5 @@ - - - #include +#include namespace Grid {