mirror of
https://github.com/paboyle/Grid.git
synced 2025-11-27 18:09:32 +00:00
Wilson flow to include options for DBW2, Iwasaki, Symanzik. View logging for data assurance
899 lines
30 KiB
C++
899 lines
30 KiB
C++
/*************************************************************************************
|
|
|
|
Grid physics library, www.github.com/paboyle/Grid
|
|
|
|
Source file: ./lib/Init.cc
|
|
|
|
Copyright (C) 2015
|
|
|
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
Author: Peter Boyle <peterboyle@MacBook-Pro.local>
|
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
See the full license in the file "LICENSE" in the top level distribution directory
|
|
*************************************************************************************/
|
|
/* END LEGAL */
|
|
/****************************************************************************/
|
|
/* pab: Signal magic. Processor state dump is x86-64 specific */
|
|
/****************************************************************************/
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
#include <unistd.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/time.h>
|
|
#include <signal.h>
|
|
#include <iostream>
|
|
#include <iterator>
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include <cstdlib>
|
|
#include <memory>
|
|
|
|
|
|
#include <Grid/Grid.h>
|
|
|
|
#include <Grid/util/CompilerCompatible.h>
|
|
|
|
#ifdef HAVE_UNWIND
|
|
#include <libunwind.h>
|
|
#endif
|
|
|
|
#include <fenv.h>
|
|
#ifdef __APPLE__
|
|
// Local stand-in for feenableexcept() used when building for __APPLE__.
// The x86 control-word implementation below is compiled out (it fails on
// Apple M1), so the function currently does nothing and always returns 0.
static int feenableexcept(unsigned int excepts)
{
#if 0
  // Fails on Apple M1
  static fenv_t fenv;
  unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
  unsigned int old_excepts;  // previous masks
  int iold_excepts;          // previous masks

  if ( fegetenv (&fenv) ) return -1;
  old_excepts = fenv.__control & FE_ALL_EXCEPT;

  // unmask
  fenv.__control &= ~new_excepts;
  fenv.__mxcsr   &= ~(new_excepts << 7);

  iold_excepts = (int) old_excepts;
  return ( fesetenv (&fenv) ? -1 : iold_excepts );
#endif
  (void)excepts;  // only consumed by the disabled implementation above
  return 0;
}
|
|
#endif
|
|
|
|
#ifndef HOST_NAME_MAX
|
|
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
|
|
#endif
|
|
|
|
// Storage for up to _NBACKTRACE instruction addresses; written by
// Grid_generic_handler while it walks the stack of the interrupted code.
void * Grid_backtrace_buffer[_NBACKTRACE];
|
|
|
|
NAMESPACE_BEGIN(Grid);
|
|
|
|
//////////////////////////////////////////////////////
|
|
// Convenience functions to access standard command line arg
|
|
// driven parallelism controls
|
|
//////////////////////////////////////////////////////
|
|
static Coordinate Grid_default_latt;
|
|
static Coordinate Grid_default_mpi;
|
|
|
|
int GridThread::_threads =1;
|
|
int GridThread::_hyperthreads=1;
|
|
int GridThread::_cores=1;
|
|
|
|
// Host name of the node this rank runs on; filled in by Grid_init via
// gethostname(). Sized for HOST_NAME_MAX characters plus the terminator.
char hostname[HOST_NAME_MAX+1];

// Returns a pointer to the first character of the host name buffer.
char *GridHostname(void)
{
  return &hostname[0];
}
|
|
// Default global lattice dimensions as stored by Grid_init (via GridParseLayout).
const Coordinate &GridDefaultLatt(void)
{
  return Grid_default_latt;
}
|
|
// Default MPI decomposition as stored by Grid_init (via GridParseLayout).
const Coordinate &GridDefaultMpi(void)
{
  return Grid_default_mpi;
}
|
|
// Builds a dims-long SIMD layout for nsimd lanes: starting from the last
// dimension and moving towards the first, each dimension takes a factor of two
// while at least two lanes remain; all other dimensions get 1.
// Asserts that exactly one lane is left over once every dimension is assigned.
const Coordinate GridDefaultSimd(int dims,int nsimd)
{
  Coordinate layout(dims);
  int remaining = nsimd;
  for (int d = dims - 1; d >= 0; --d) {
    const bool split = (remaining >= 2);
    layout[d] = split ? 2 : 1;
    if (split) {
      remaining /= 2;
    }
  }
  GRID_ASSERT(remaining==1);
  return layout;
}
|
|
|
|
////////////////////////////////////////////////////////////
|
|
// Command line parsing assist for stock controls
|
|
////////////////////////////////////////////////////////////
|
|
// Returns the argument that immediately follows the first occurrence of
// `option` in [begin,end). Yields an empty string when the option is absent or
// is the final argument.
std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
  char **hit = std::find(begin, end, option);
  if (hit == end) {
    return std::string("");
  }
  char **value = hit + 1;
  if (value == end) {
    return std::string("");
  }
  return std::string(*value);
}
|
|
// True when some argument in [begin,end) is exactly equal to `option`.
bool GridCmdOptionExists(char** begin, char** end, const std::string& option)
{
  return std::any_of(begin, end,
                     [&option](const char *candidate) { return candidate == option; });
}
|
|
// Comma separated list
|
|
// Splits `str` at every ',' and stores the pieces in `vec`, replacing its
// previous contents. Empty fields are kept, so an empty input yields a single
// empty token.
void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec)
{
  const std::string delimiter(",");

  vec.clear();
  std::string::size_type from = 0;
  for (;;) {
    const std::string::size_type comma = str.find(delimiter, from);
    if (comma == std::string::npos) {
      break;
    }
    vec.push_back(str.substr(from, comma - from));
    from = comma + delimiter.length();
  }
  // Whatever follows the last delimiter (possibly nothing) is the final token.
  vec.push_back(str.substr(from));
}
|
|
|
|
// Parses a list of integers separated by single punctuation characters
// (e.g. "4.4.4.8") into `vec`, replacing its previous contents. Parsing stops
// at the first position where an integer cannot be read.
template<class VectorInt>
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec)
{
  vec.resize(0);
  std::istringstream input(str);
  for (int value; input >> value; ) {
    vec.push_back(value);
    // Swallow one separator character, if the next character is punctuation.
    if (std::ispunct(input.peek())) {
      input.ignore();
    }
  }
}
|
|
|
|
// Explicit instantiation for std::vector<int> destinations.
template void GridCmdOptionIntVector(const std::string &str,std::vector<int> & vec);
|
|
// Explicit instantiation for Coordinate destinations.
template void GridCmdOptionIntVector(const std::string &str,Coordinate & vec);
|
|
|
|
// Reads one integer from the start of `str` into `val` using stream extraction.
void GridCmdOptionInt(std::string &str,int & val)
{
  std::istringstream input(str);
  input >> val;
}
|
|
|
|
// Reads one floating point number from the start of `str` into `val` using
// stream extraction.
void GridCmdOptionFloat(std::string &str,double & val)
{
  std::istringstream input(str);
  input >> val;
}
|
|
|
|
void GridParseLayout(char **argv,int argc,
|
|
Coordinate &latt_c,
|
|
Coordinate &mpi_c)
|
|
{
|
|
auto mpi =std::vector<int>({1,1,1,1});
|
|
auto latt=std::vector<int>({8,8,8,8});
|
|
|
|
GridThread::SetMaxThreads();
|
|
|
|
std::string arg;
|
|
if( GridCmdOptionExists(argv,argv+argc,"--mpi") ){
|
|
arg = GridCmdOptionPayload(argv,argv+argc,"--mpi");
|
|
GridCmdOptionIntVector(arg,mpi);
|
|
}
|
|
if( GridCmdOptionExists(argv,argv+argc,"--grid") ){
|
|
arg= GridCmdOptionPayload(argv,argv+argc,"--grid");
|
|
GridCmdOptionIntVector(arg,latt);
|
|
}
|
|
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
|
|
std::vector<int> ompthreads(0);
|
|
#ifndef GRID_OMP
|
|
std::cout << GridLogWarning << "'--threads' option used but Grid was"
|
|
<< " not compiled with thread support" << std::endl;
|
|
#endif
|
|
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
|
|
GridCmdOptionIntVector(arg,ompthreads);
|
|
GRID_ASSERT(ompthreads.size()==1);
|
|
GridThread::SetThreads(ompthreads[0]);
|
|
}
|
|
if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
|
|
std::vector<int> gputhreads(0);
|
|
arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
|
|
GridCmdOptionIntVector(arg,gputhreads);
|
|
GRID_ASSERT(gputhreads.size()==1);
|
|
acceleratorThreads(gputhreads[0]);
|
|
}
|
|
|
|
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
|
|
int cores;
|
|
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
|
|
GridCmdOptionInt(arg,cores);
|
|
GridThread::SetCores(cores);
|
|
}
|
|
// Copy back into coordinate format
|
|
int nd = mpi.size();
|
|
GRID_ASSERT(latt.size()==nd);
|
|
latt_c.resize(nd);
|
|
mpi_c.resize(nd);
|
|
for(int d=0;d<nd;d++){
|
|
latt_c[d] = latt[d];
|
|
mpi_c[d] = mpi[d];
|
|
}
|
|
}
|
|
|
|
// Formats the elements of an integer container as text, each element converted
// to int and followed by a single space (so the result ends with a space).
template<class VectorInt>
std::string GridCmdVectorIntToString(const VectorInt & vec_in){
  std::ostringstream text;
  const int count = vec_in.size();
  for (int idx = 0; idx < count; ++idx) {
    const int element = vec_in[idx];
    text << element << " ";
  }
  return text.str();
}
|
|
/////////////////////////////////////////////////////////
|
|
// Reinit guard
|
|
/////////////////////////////////////////////////////////
|
|
static MemoryStats dbgMemStats;
|
|
static int Grid_is_initialised;
|
|
|
|
/////////////////////////////////////////////////////////
|
|
// Start-up banner
|
|
/////////////////////////////////////////////////////////
|
|
void GridBanner(void)
|
|
{
|
|
std::cout <<std::endl;
|
|
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
|
|
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
|
|
std::cout << "__|_ | | | | | | | | | | | | _|__"<<std::endl;
|
|
std::cout << "__|_ _|__"<<std::endl;
|
|
std::cout << "__|_ GGGG RRRR III DDDD _|__"<<std::endl;
|
|
std::cout << "__|_ G R R I D D _|__"<<std::endl;
|
|
std::cout << "__|_ G R R I D D _|__"<<std::endl;
|
|
std::cout << "__|_ G GG RRRR I D D _|__"<<std::endl;
|
|
std::cout << "__|_ G G R R I D D _|__"<<std::endl;
|
|
std::cout << "__|_ GGGG R R III DDDD _|__"<<std::endl;
|
|
std::cout << "__|_ _|__"<<std::endl;
|
|
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
|
|
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
|
|
std::cout << " | | | | | | | | | | | | | | "<<std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
|
|
std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
|
|
std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
|
|
std::cout << "(at your option) any later version."<<std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
|
|
std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
|
|
std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
|
|
std::cout << "GNU General Public License for more details."<<std::endl;
|
|
printHash();
|
|
#ifdef GRID_BUILD_REF
|
|
#define _GRID_BUILD_STR(x) #x
|
|
#define GRID_BUILD_STR(x) _GRID_BUILD_STR(x)
|
|
std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
|
|
#endif
|
|
std::cout << std::endl;
|
|
std::cout << std::setprecision(9);
|
|
}
|
|
|
|
//Some file local variables
|
|
static int fileno_stdout;
|
|
static int fileno_stderr;
|
|
static int signal_delay;
|
|
// One executable mapping of this process as read from /proc/self/maps.
// All numeric members start at zero so that a default-constructed region never
// carries indeterminate values into dlMap or into the signal-time backtrace.
class dlRegion {
public:
  uint64_t start  = 0;  // first address of the mapping
  uint64_t end    = 0;  // address just past the mapping
  uint64_t size   = 0;  // end - start
  uint64_t offset = 0;  // offset field of the maps entry
  std::string name;     // pathname field of the maps entry (may be empty)
};

// Executable mappings recorded by Grid_init; consulted by Grid_generic_handler
// to translate instruction addresses into "object+offset" form.
std::vector<dlRegion> dlMap;
|
|
|
|
void Grid_init(int *argc,char ***argv)
|
|
{
|
|
|
|
GRID_ASSERT(Grid_is_initialised == 0);
|
|
|
|
GridLogger::GlobalStopWatch.Start();
|
|
|
|
std::string arg;
|
|
|
|
//////////////////////////////////////////////////////////
|
|
// Early intialisation necessities without rank knowledge
|
|
//////////////////////////////////////////////////////////
|
|
acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
|
|
int MB;
|
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
|
GridCmdOptionInt(arg,MB);
|
|
uint64_t MB64 = MB;
|
|
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
|
|
}
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){
|
|
int forcempi;
|
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi");
|
|
GridCmdOptionInt(arg,forcempi);
|
|
Stencil_force_mpi = (bool)forcempi;
|
|
}
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
|
|
int MB;
|
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
|
|
GridCmdOptionInt(arg,MB);
|
|
uint64_t MB64 = MB;
|
|
MemoryManager::DeviceMaxBytes = MB64*1024LL*1024LL;
|
|
}
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){
|
|
int enable;
|
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube");
|
|
GridCmdOptionInt(arg,enable);
|
|
GlobalSharedMemory::HPEhypercube = enable;
|
|
}
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
|
|
GlobalSharedMemory::Hugepages = 1;
|
|
}
|
|
|
|
#if defined(A64FX)
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
|
std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
#endif
|
|
|
|
//////////////////////////////////////////////////////////
|
|
// Memory manager
|
|
//////////////////////////////////////////////////////////
|
|
MemoryManager::Init();
|
|
|
|
//////////////////////////////////////////////////////////
|
|
// MPI initialisation
|
|
//////////////////////////////////////////////////////////
|
|
CartesianCommunicator::Init(argc,argv);
|
|
|
|
GridLogger::GlobalStopWatch.Stop();
|
|
CartesianCommunicator::BarrierWorld();
|
|
GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock
|
|
GridLogger::GlobalStopWatch.Start();
|
|
|
|
////////////////////////////////////
|
|
// Banner after MPI (unless GPU)
|
|
////////////////////////////////////
|
|
if ( CartesianCommunicator::RankWorld() == 0 ) {
|
|
GridBanner();
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////
|
|
// Rank information can be used to control who logs
|
|
/////////////////////////////////////////////////////////////////
|
|
if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
|
|
Grid_quiesce_nodes();
|
|
} else {
|
|
FILE *fp;
|
|
std::ostringstream fname;
|
|
|
|
int rank = CartesianCommunicator::RankWorld();
|
|
int radix=32;
|
|
char* root = getenv("GRID_STDOUT_ROOT");
|
|
if (root) {
|
|
fname << root ;
|
|
mkdir(fname.str().c_str(), S_IRWXU );
|
|
fname << "/";
|
|
}
|
|
fname << (rank/radix)*radix ;
|
|
mkdir(fname.str().c_str(), S_IRWXU );
|
|
fname << "/";
|
|
fname<<"Grid.stdout.";
|
|
fname<<CartesianCommunicator::RankWorld();
|
|
|
|
std::cout << " Reconnecting stdout to "<<fname.str()<<std::endl;
|
|
|
|
fp=freopen(fname.str().c_str(),"w",stdout);
|
|
GRID_ASSERT(fp!=(FILE *)NULL);
|
|
|
|
std::ostringstream ename;
|
|
if (root){
|
|
ename << root << "/";
|
|
}
|
|
ename << (rank/radix)*radix << "/";
|
|
ename<<"Grid.stderr.";
|
|
ename<<CartesianCommunicator::RankWorld();
|
|
std::cout << " Reconnecting stderr to "<<ename.str()<<std::endl;
|
|
fp=freopen(ename.str().c_str(),"w",stderr);
|
|
GRID_ASSERT(fp!=(FILE *)NULL);
|
|
}
|
|
fileno_stdout = fileno(stdout);
|
|
fileno_stderr = fileno(stderr) ;
|
|
dup2(fileno_stdout, STDOUT_FILENO);
|
|
dup2(fileno_stderr, STDERR_FILENO);
|
|
////////////////////////////////////////////////////
|
|
// OK to use GridLogMessage etc from here on
|
|
////////////////////////////////////////////////////
|
|
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
|
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
|
|
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
|
{
|
|
gethostname(hostname, HOST_NAME_MAX+1);
|
|
time_t mytime;
|
|
struct tm *info;
|
|
char buffer[80];
|
|
time(&mytime);
|
|
info = localtime(&mytime);
|
|
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
|
|
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////
|
|
// Reporting
|
|
/////////////////////////////////////////////////////////
|
|
std::cout << GridLogMessage << "Requested "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
|
if ( GlobalSharedMemory::Hugepages) {
|
|
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
|
|
}
|
|
|
|
MemoryManager::InitMessage();
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
|
|
MemoryProfiler::debug = true;
|
|
MemoryProfiler::stats = &dbgMemStats;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////
|
|
// LD.so space
|
|
/////////////////////////////////////////////////////////
|
|
#ifndef __APPLE__
|
|
{
|
|
// Provides mapping of .so files
|
|
FILE *f = fopen("/proc/self/maps", "r");
|
|
if (f) {
|
|
char line[256];
|
|
while (fgets(line, sizeof(line), f)) {
|
|
if (strstr(line, "r-xp")) {
|
|
dlRegion region;
|
|
uint32_t major, minor, inode;
|
|
uint64_t start,end,offset;
|
|
char path[PATH_MAX];
|
|
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
|
|
&start,&end,&offset,
|
|
&major,&minor,&inode,path);
|
|
region.start=start;
|
|
region.end =end;
|
|
region.offset=offset;
|
|
region.name = std::string(path);
|
|
region.size = region.end-region.start;
|
|
dlMap.push_back(region);
|
|
// std::cout << GridLogMessage<< line;
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
|
|
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
|
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
|
|
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
|
for(int r=0;r<dlMap.size();r++){
|
|
auto region = dlMap[r];
|
|
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
|
|
}
|
|
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
|
}
|
|
}
|
|
#endif
|
|
////////////////////////////////////
|
|
// Logging
|
|
////////////////////////////////////
|
|
std::vector<std::string> logstreams;
|
|
std::string defaultLog("Error,Warning,Message");
|
|
GridCmdOptionCSL(defaultLog,logstreams);
|
|
GridLogConfigure(logstreams);
|
|
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
|
|
arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
|
|
GridCmdOptionCSL(arg,logstreams);
|
|
GridLogConfigure(logstreams);
|
|
}
|
|
|
|
////////////////////////////////////
|
|
// Help message
|
|
////////////////////////////////////
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
|
|
std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --shm-mpi 0|1 : Force MPI usage under multi-rank per node "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
|
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
|
|
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
|
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
std::cout<<GridLogMessage<<std::endl;
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
|
|
////////////////////////////////////
|
|
// Performance options
|
|
////////////////////////////////////
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
|
|
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptHandUnroll;
|
|
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptHandUnroll;
|
|
}
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
|
|
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptInlineAsm;
|
|
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptInlineAsm;
|
|
}
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
|
|
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptGeneric;
|
|
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptGeneric;
|
|
}
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
|
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
|
|
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
|
} else {
|
|
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
|
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
|
|
}
|
|
|
|
////////////////////////////////
|
|
// Timestamping or not
|
|
////////////////////////////////
|
|
|
|
CartesianCommunicator::nCommThreads = 1;
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
|
|
GridLogTimestamp(0);
|
|
} else {
|
|
GridLogTimestamp(1);
|
|
}
|
|
|
|
////////////////////////////////
|
|
// Default layout
|
|
////////////////////////////////
|
|
GridParseLayout(*argv,*argc,
|
|
Grid_default_latt,
|
|
Grid_default_mpi);
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
|
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
|
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
|
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
|
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
|
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
|
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
|
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
|
}
|
|
|
|
////////////////////////////////////
|
|
// Debug options
|
|
////////////////////////////////////
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
|
Grid_debug_handler_init();
|
|
}
|
|
// Sleep n-seconds at end of handler
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
|
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
|
|
GridCmdOptionInt(arg,signal_delay);
|
|
}
|
|
// periodic wakeup with stack trace printed
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
|
|
Grid_debug_heartbeat();
|
|
}
|
|
// periodic wakeup with empty handler (interrupts some system calls)
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
|
|
Grid_heartbeat();
|
|
}
|
|
|
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
|
|
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
|
|
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
|
|
FlightRecorder::PrintEntireLog = 1;
|
|
FlightRecorder::ChecksumComms = 1;
|
|
FlightRecorder::ChecksumCommsSend=1;
|
|
}
|
|
|
|
Grid_is_initialised = 1;
|
|
}
|
|
|
|
|
|
void Grid_finalize(void)
|
|
{
|
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
|
std::cout<<GridLogMessage<<"******* Grid Finalize ******"<<std::endl;
|
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
|
|
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
MPI_Finalize();
|
|
Grid_unquiesce_nodes();
|
|
#endif
|
|
#if defined (GRID_COMMS_SHMEM)
|
|
shmem_finalize();
|
|
#endif
|
|
Grid_is_initialised = 0;
|
|
}
|
|
|
|
// Logs the default global lattice size, the thread count reported by
// GridThread::GetThreads(), and the default MPI decomposition.
void GridLogLayout() {
  const std::string lattice = GridCmdVectorIntToString(GridDefaultLatt());
  const std::string tasks   = GridCmdVectorIntToString(GridDefaultMpi());

  std::cout << GridLogMessage << "Grid Layout\n";
  std::cout << GridLogMessage << "\tGlobal lattice size : "<< lattice << std::endl;
  std::cout << GridLogMessage << "\tOpenMP threads : "<< GridThread::GetThreads() <<std::endl;
  std::cout << GridLogMessage << "\tMPI tasks : "<< tasks << std::endl;
}
|
|
|
|
// Writes the NUL-terminated string A straight to the stderr descriptor with
// write(2), bypassing stdio. A is evaluated twice, and the expansion already
// ends in ';'.
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
|
|
|
|
// Emits one lowercase hexadecimal digit through SIGLOG.
// Values of 16 or more produce no output.
void sig_print_dig(uint32_t dig)
{
  static const char *const digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
                                       "8", "9", "a", "b", "c", "d", "e", "f" };
  if (dig >= 16) {
    return;
  }
  SIGLOG(digits[dig]);
}
|
|
// Emits A in decimal through SIGLOG / sig_print_dig, without leading zeros
// ("0" for zero). Uses no stdio and no formatting routines.
void sig_print_uint(uint32_t A)
{
  bool started = false;  // becomes true once a non-zero digit has been printed
  // Ten decimal places cover the full 32-bit range.
  for (uint32_t place = 1000000000u; place != 0; place /= 10) {
    const uint32_t dig = (A / place) % 10;
    if (dig != 0 || started) {
      sig_print_dig(dig);
      started = true;
    }
  }
  if (!started) {
    SIGLOG("0");
  }
}
|
|
// Emits A as "0x" followed by its hexadecimal digits through SIGLOG /
// sig_print_dig. Leading zero nibbles are suppressed; the lowest nibble is
// always printed, so zero appears as "0x0".
void sig_print_hex(uint64_t A)
{
  bool started = false;  // becomes true once a non-zero nibble has been printed
  SIGLOG("0x");
  for (int shift = 15*4; shift >= 4; shift -= 4) {
    const uint32_t nib = (uint32_t)((A >> shift) & 0xF);
    if (nib != 0 || started) {
      sig_print_dig(nib);
      started = true;
    }
  }
  sig_print_dig(A&0xF);
}
|
|
/*
|
|
#ifdef __linux__
|
|
#ifdef __x86_64__
|
|
ucontext_t * uc= (ucontext_t *)ptr;
|
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
|
#endif
|
|
#endif
|
|
*/
|
|
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
|
|
{
|
|
SIGLOG("Signal handler on host ");
|
|
SIGLOG(hostname);
|
|
SIGLOG(" process id ");
|
|
sig_print_uint((uint32_t)getpid());
|
|
SIGLOG("\n");
|
|
SIGLOG("FlightRecorder step ");
|
|
sig_print_uint(FlightRecorder::StepLoggingCounter);
|
|
SIGLOG(" stage ");
|
|
SIGLOG(FlightRecorder::StepName);
|
|
SIGLOG("\n");
|
|
SIGLOG("Caught signal ");
|
|
sig_print_uint(si->si_signo);
|
|
SIGLOG("\n");
|
|
SIGLOG(" mem address ");
|
|
sig_print_hex((uint64_t)si->si_addr);
|
|
SIGLOG("\n");
|
|
SIGLOG(" code ");
|
|
sig_print_uint(si->si_code);
|
|
SIGLOG("\n");
|
|
|
|
ucontext_t *uc= (ucontext_t *)ptr;
|
|
|
|
SIGLOG("Backtrace:\n");
|
|
#ifdef HAVE_UNWIND
|
|
// Debug cross check on offsets
|
|
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
|
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
|
unw_cursor_t cursor;
|
|
unw_word_t ip, off;
|
|
if (!unw_init_local(&cursor, uc) ) {
|
|
|
|
SIGLOG(" frame IP function\n");
|
|
int level = 0;
|
|
int ret = 0;
|
|
while(1) {
|
|
char name[128];
|
|
if (level >= _NBACKTRACE) return;
|
|
|
|
unw_get_reg(&cursor, UNW_REG_IP, &ip);
|
|
|
|
sig_print_uint(level); SIGLOG(" ");
|
|
sig_print_hex(ip); SIGLOG(" ");
|
|
for(int r=0;r<dlMap.size();r++){
|
|
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
|
|
SIGLOG(dlMap[r].name.c_str());
|
|
SIGLOG("+");
|
|
sig_print_hex((ip-dlMap[r].start));
|
|
break;
|
|
}
|
|
}
|
|
SIGLOG("\n");
|
|
Grid_backtrace_buffer[level]=(void *)ip;
|
|
level++;
|
|
ret = unw_step(&cursor);
|
|
if (ret <= 0) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
// Known Asynch-Signal unsafe
|
|
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
|
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
|
#endif
|
|
}
|
|
|
|
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|
{
|
|
Grid_generic_handler(sig,si,ptr);
|
|
SIGLOG("\n");
|
|
}
|
|
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|
{
|
|
Grid_generic_handler(sig,si,ptr);
|
|
if (signal_delay) {
|
|
SIGLOG("Adding extra signal delay ");
|
|
sig_print_uint(signal_delay);
|
|
SIGLOG(" s\n");
|
|
usleep( (uint64_t) signal_delay*1000LL*1000LL);
|
|
}
|
|
SIGLOG("\n");
|
|
return;
|
|
}
|
|
|
|
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|
{
|
|
Grid_generic_handler(sig,si,ptr);
|
|
SIGLOG("\n");
|
|
exit(0);
|
|
return;
|
|
};
|
|
// Signal handler that deliberately does nothing; installed by Grid_heartbeat
// so that the periodic SIGALRM is delivered without any further action.
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
  //  SIGLOG("heartbeat signal handled\n");
}
|
|
void Grid_debug_heartbeat(void)
|
|
{
|
|
struct sigaction sa_ping;
|
|
|
|
sigemptyset (&sa_ping.sa_mask);
|
|
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
|
sa_ping.sa_flags = SA_SIGINFO;
|
|
sigaction(SIGALRM,&sa_ping,NULL);
|
|
|
|
// repeating 10s heartbeat
|
|
struct itimerval it_val;
|
|
it_val.it_value.tv_sec = 10;
|
|
it_val.it_value.tv_usec = 0;
|
|
it_val.it_interval = it_val.it_value;
|
|
setitimer(ITIMER_REAL, &it_val, NULL);
|
|
}
|
|
void Grid_heartbeat(void)
|
|
{
|
|
struct sigaction sa_ping;
|
|
|
|
sigemptyset (&sa_ping.sa_mask);
|
|
sa_ping.sa_sigaction= Grid_empty_signal_handler;
|
|
sa_ping.sa_flags = SA_SIGINFO;
|
|
sigaction(SIGALRM,&sa_ping,NULL);
|
|
|
|
// repeating 10s heartbeat
|
|
struct itimerval it_val;
|
|
it_val.it_value.tv_sec = 0;
|
|
it_val.it_value.tv_usec = 10000;
|
|
it_val.it_interval = it_val.it_value;
|
|
setitimer(ITIMER_REAL, &it_val, NULL);
|
|
}
|
|
void Grid_exit_handler(void)
|
|
{
|
|
BACKTRACEFP(stdout);
|
|
fflush(stdout);
|
|
}
|
|
void Grid_debug_handler_init(void)
|
|
{
|
|
struct sigaction sa;
|
|
sigemptyset (&sa.sa_mask);
|
|
sa.sa_sigaction= Grid_fatal_signal_handler;
|
|
sa.sa_flags = SA_SIGINFO;
|
|
sigaction(SIGTRAP,&sa,NULL);
|
|
sigaction(SIGILL,&sa,NULL);
|
|
sigaction(SIGABRT,&sa,NULL); // SigABRT backtrace
|
|
#ifndef GRID_SYCL
|
|
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
|
|
sigaction(SIGBUS,&sa,NULL);
|
|
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
|
sigaction(SIGFPE,&sa,NULL);
|
|
#endif
|
|
|
|
// Non terminating SIGHUP handler
|
|
struct sigaction sa_ping;
|
|
sigemptyset (&sa_ping.sa_mask);
|
|
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
|
sa_ping.sa_flags = SA_SIGINFO;
|
|
sigaction(SIGHUP,&sa_ping,NULL);
|
|
|
|
// atexit(Grid_exit_handler);
|
|
}
|
|
|
|
NAMESPACE_END(Grid);
|
|
|