From 22d384b07dcf0f3cd612c880e15c7a66fc84cef0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 May 2015 18:59:03 +0100 Subject: [PATCH] Adding a better controlled threading class, preparing to force in deterministic reduction. --- benchmarks/Grid_comms.cc | 7 +- benchmarks/Grid_memory_bandwidth.cc | 9 +- benchmarks/Grid_wilson.cc | 9 +- lib/Grid.h | 13 ++- lib/Grid_init.cc | 125 ++++++++++++++--------- lib/Grid_threads.h | 80 +++++++++++++++ lib/cartesian/Grid_cartesian_base.h | 5 +- lib/cartesian/Grid_cartesian_red_black.h | 2 +- tests/Grid_cshift.cc | 7 +- tests/Grid_gamma.cc | 10 +- tests/Grid_main.cc | 7 +- tests/Grid_nersc_io.cc | 7 +- tests/Grid_simd.cc | 8 +- tests/Grid_stencil.cc | 7 +- 14 files changed, 199 insertions(+), 97 deletions(-) create mode 100644 lib/Grid_threads.h diff --git a/benchmarks/Grid_comms.cc b/benchmarks/Grid_comms.cc index 5317e127..3dedfb5b 100644 --- a/benchmarks/Grid_comms.cc +++ b/benchmarks/Grid_comms.cc @@ -8,11 +8,8 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector latt_size; - std::vector simd_layout; - std::vector mpi_layout; - - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); int Nloop=10; int nmu=0; diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc index dd6d1816..cb444128 100644 --- a/benchmarks/Grid_memory_bandwidth.cc +++ b/benchmarks/Grid_memory_bandwidth.cc @@ -8,17 +8,14 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector tmp_latt_size; - std::vector simd_layout; - std::vector mpi_layout; - - GridParseLayout(argv,argc,tmp_latt_size,simd_layout,mpi_layout); - const int Nvec=8; typedef Lattice< iVector< vReal,Nvec> > LatticeVec; int Nloop=1000; + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); + std::cout << "===================================================================================================="< latt_size; - std::vector simd_layout; - std::vector mpi_layout; - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); - + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size,simd_layout,mpi_layout); + std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); diff --git a/lib/Grid.h b/lib/Grid.h index cce56c0c..da04438a 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -45,8 +45,11 @@ #include #include -#include +#include + #include + +#include #include #include #include @@ -60,6 +63,7 @@ namespace Grid { void Grid_init(int *argc,char ***argv); void Grid_finalize(void); + // internal, controled with --handle void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_debug_handler_init(void); void Grid_quiesce_nodes(void); @@ -68,6 +72,11 @@ namespace Grid { // C++11 time facilities better? double usecond(void); + const std::vector &GridDefaultSimd(void); + const std::vector &GridDefaultLatt(void); + const std::vector &GridDefaultMpi(void); + const int &GridThreads(void) ; + void GridSetThreads(int t) ; // Common parsing chores std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); @@ -75,8 +84,8 @@ namespace Grid { void GridParseIntVector(std::string &str,std::vector & vec); void GridParseLayout(char **argv,int argc, - std::vector &simd, std::vector &latt, + std::vector &simd, std::vector &mpi); diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index e80bbe64..9e977064 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -1,5 +1,5 @@ /****************************************************************************/ -/* PAB: Signal magic. Processor state dump is x86-64 specific */ +/* pab: Signal magic. Processor state dump is x86-64 specific */ /****************************************************************************/ #include @@ -23,23 +23,25 @@ namespace Grid { - void Grid_quiesce_nodes(void) - { -#ifdef GRID_COMMS_MPI - int me; - MPI_Comm_rank(MPI_COMM_WORLD,&me); - if ( me ) { - std::cout.setstate(std::ios::badbit); - } -#endif - } - void Grid_unquiesce_nodes(void) - { -#ifdef GRID_COMMS_MPI - std::cout.clear(); -#endif - } + ////////////////////////////////////////////////////// + // Convenience functions to access stadard command line arg + // driven parallelism controls + ////////////////////////////////////////////////////// + static std::vector Grid_default_simd; + static std::vector Grid_default_latt; + static std::vector Grid_default_mpi; + int GridThread::_threads; + + + const std::vector &GridDefaultSimd(void) {return Grid_default_simd;}; + const std::vector &GridDefaultLatt(void) {return Grid_default_latt;}; + const std::vector &GridDefaultMpi(void) {return Grid_default_mpi;}; + + + //////////////////////////////////////////////////////////// + // Command line parsing assist for stock controls + //////////////////////////////////////////////////////////// std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option) { char ** itr = std::find(begin, end, option); @@ -53,15 +55,6 @@ bool GridCmdOptionExists(char** begin, char** end, const std::string& option) { return std::find(begin, end, option) != end; } -void Grid_init(int *argc,char ***argv) -{ -#ifdef GRID_COMMS_MPI - MPI_Init(argc,argv); -#endif - // Parse command line args. - Grid_quiesce_nodes(); - -} void GridCmdOptionIntVector(std::string &str,std::vector & vec) { @@ -70,7 +63,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector & vec) int i; while (ss >> i){ vec.push_back(i); - if (ss.peek() == ',') + if(std::ispunct(ss.peek())) ss.ignore(); } return; @@ -94,38 +87,74 @@ void GridParseLayout(char **argv,int argc, simd=std::vector({1,2,2,2}); #endif - + GridThread::SetMaxThreads(); + std::string arg; if( GridCmdOptionExists(argv,argv+argc,"--mpi") ){ arg = GridCmdOptionPayload(argv,argv+argc,"--mpi"); GridCmdOptionIntVector(arg,mpi); - std::cout<<"MPI "; - for(int i=0;i ompthreads(0); + arg= GridCmdOptionPayload(argv,argv+argc,"--omp"); + GridCmdOptionIntVector(arg,ompthreads); + assert(ompthreads.size()==1); + GridThread::SetThreads(ompthreads[0]); + } + } + + ///////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////// +void Grid_init(int *argc,char ***argv) +{ +#ifdef GRID_COMMS_MPI + MPI_Init(argc,argv); +#endif + // Parse command line args. + + if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ + Grid_debug_handler_init(); + } + if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){ + Grid_quiesce_nodes(); + } + GridParseLayout(*argv,*argc, + Grid_default_latt, + Grid_default_simd, + Grid_default_mpi); + +} + + + //////////////////////////////////////////////////////////// + // Verbose limiter on MPI tasks + //////////////////////////////////////////////////////////// + void Grid_quiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + int me; + MPI_Comm_rank(MPI_COMM_WORLD,&me); + if ( me ) { + std::cout.setstate(std::ios::badbit); + } +#endif + } + void Grid_unquiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + std::cout.clear(); +#endif + } + void Grid_finalize(void) { @@ -146,14 +175,14 @@ void * Grid_backtrace_buffer[_NBACKTRACE]; void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { printf("Caught signal %d\n",si->si_signo); - printf(" mem address %lx\n",(uint64_t)si->si_addr); + printf(" mem address %llx\n",(unsigned long long)si->si_addr); printf(" code %d\n",si->si_code); #ifdef __X86_64 ucontext_t * uc= (ucontext_t *)ptr; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; - printf(" instruction %llx\n",(uint64_t)sc->rip); -#define REG(A) printf(" %s %lx\n",#A, sc-> A); + printf(" instruction %llx\n",(unsigned long long)sc->rip); +#define REG(A) printf(" %s %lx\n",#A,sc-> A); REG(rdi); REG(rsi); REG(rbp); diff --git a/lib/Grid_threads.h b/lib/Grid_threads.h new file mode 100644 index 00000000..6c5f17e0 --- /dev/null +++ b/lib/Grid_threads.h @@ -0,0 +1,80 @@ +#ifndef GRID_THREADS_H +#define GRID_THREADS_H + +#ifdef HAVE_OPENMP +#include +#define PARALLEL_FOR_LOOP _Pragma("omp parallel for") +#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")") +#else +#define PARALLEL_FOR_LOOP +#define PARALLEL_NESTED_LOOP(n) +#endif + +namespace Grid { + + // Introduce a class to gain deterministic bit reproducible reduction. + // make static; perhaps just a namespace is required. + +class GridThread { + public: + static int _threads; + + static void SetThreads(int thr) { +#ifdef HAVE_OPENMP + _threads = MIN(thr,omp_get_max_threads()) ; + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static void SetMaxThreads(void) { +#ifdef HAVE_OPENMP + _threads = omp_get_max_threads(); + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static int GetThreads(void) { return _threads; }; + static int SumArraySize(void) {return _threads;}; + + static void GetWork(int nwork, int me, int & mywork, int & myoff){ + int basework = nwork/_threads; + int backfill = _threads-(nwork%_threads); + if ( me >= _threads ) { + mywork = myoff = 0; + } else { + mywork = (nwork+me)/_threads; + myoff = basework * me; + if ( me > backfill ) + myoff+= (me-backfill); + } + return; + }; + + static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ + me = ThreadBarrier(); + GetWork(nwork,me,mywork,myoff); + }; + + static int ThreadBarrier(void) { +#ifdef HAVE_OPENMP +#pragma omp barrier + return omp_get_thread_num(); +#else + return 0; +#endif + }; + + template static void ThreadSum( std::vector &sum_array,obj &val,int me){ + sum_array[me] = val; + val=zero; + ThreadBarrier(); + for(int i=0;i<_threads;i++) val+= sum_array[i]; + ThreadBarrier(); + }; + +}; + +} +#endif diff --git a/lib/cartesian/Grid_cartesian_base.h b/lib/cartesian/Grid_cartesian_base.h index c297cbb9..a74773f8 100644 --- a/lib/cartesian/Grid_cartesian_base.h +++ b/lib/cartesian/Grid_cartesian_base.h @@ -14,7 +14,7 @@ namespace Grid{ // int _processor; // linear processor rank // std::vector _processor_coor; // linear processor rank ////////////////////////////////////////////////////////////////////// - class GridBase : public CartesianCommunicator { + class GridBase : public CartesianCommunicator , public GridThread { public: @@ -22,7 +22,8 @@ public: template friend class Lattice; GridBase(std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; - + + // Physics Grid information. std::vector _simd_layout;// Which dimensions get relayed out over simd lanes. std::vector _fdimensions;// Global dimensions of array prior to cb removal diff --git a/lib/cartesian/Grid_cartesian_red_black.h b/lib/cartesian/Grid_cartesian_red_black.h index 55bd1f20..475b47c2 100644 --- a/lib/cartesian/Grid_cartesian_red_black.h +++ b/lib/cartesian/Grid_cartesian_red_black.h @@ -46,7 +46,7 @@ public: }; GridRedBlackCartesian(std::vector &dimensions, std::vector &simd_layout, - std::vector &processor_grid) : GridBase(processor_grid) + std::vector &processor_grid ) : GridBase(processor_grid) { /////////////////////// // Grid information diff --git a/tests/Grid_cshift.cc b/tests/Grid_cshift.cc index 0eb6285a..e92ff793 100644 --- a/tests/Grid_cshift.cc +++ b/tests/Grid_cshift.cc @@ -8,11 +8,10 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector simd_layout; - std::vector mpi_layout; - std::vector latt_size; + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice(); diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc index f8f86242..9779ca07 100644 --- a/tests/Grid_gamma.cc +++ b/tests/Grid_gamma.cc @@ -14,12 +14,10 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector latt_size; - std::vector simd_layout; - std::vector mpi_layout; - - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); - + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridParallelRNG pRNG(&Grid); diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 9a7aa70b..c3454dfc 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -25,11 +25,10 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector latt_size; - std::vector simd_layout; - std::vector mpi_layout; + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); latt_size.resize(4); #ifdef AVX512 diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index e8ff7209..73d7edfe 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -10,12 +10,9 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector tmp_latt_size; - std::vector simd_layout; - std::vector mpi_layout; - - GridParseLayout(argv,argc,tmp_latt_size,simd_layout,mpi_layout); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); std::vector latt_size ({16,16,16,32}); std::vector clatt_size ({4,4,4,8}); int orthodir=3; diff --git a/tests/Grid_simd.cc b/tests/Grid_simd.cc index 357e0fe8..6a957d7e 100644 --- a/tests/Grid_simd.cc +++ b/tests/Grid_simd.cc @@ -106,11 +106,9 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector latt_size; - std::vector simd_layout; - std::vector mpi_layout; - - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size,simd_layout,mpi_layout); std::vector seeds({1,2,3,4}); diff --git a/tests/Grid_stencil.cc b/tests/Grid_stencil.cc index c4b516dd..d9e779bf 100644 --- a/tests/Grid_stencil.cc +++ b/tests/Grid_stencil.cc @@ -8,11 +8,10 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector latt_size; - std::vector simd_layout; - std::vector mpi_layout; - GridParseLayout(argv,argc,latt_size,simd_layout,mpi_layout); + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(); + std::vector mpi_layout = GridDefaultMpi(); double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];