mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-15 02:05:37 +00:00
Better flight logging
This commit is contained in:
parent
beb0e474ee
commit
ec1395a304
@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
|
|||||||
int FlightRecorder::LoggingMode;
|
int FlightRecorder::LoggingMode;
|
||||||
int FlightRecorder::ChecksumComms;
|
int FlightRecorder::ChecksumComms;
|
||||||
int FlightRecorder::ChecksumCommsSend;
|
int FlightRecorder::ChecksumCommsSend;
|
||||||
|
const char * FlightRecorder::StepName;
|
||||||
|
int32_t FlightRecorder::StepLoggingCounter;
|
||||||
int32_t FlightRecorder::XmitLoggingCounter;
|
int32_t FlightRecorder::XmitLoggingCounter;
|
||||||
int32_t FlightRecorder::RecvLoggingCounter;
|
int32_t FlightRecorder::RecvLoggingCounter;
|
||||||
int32_t FlightRecorder::CsumLoggingCounter;
|
int32_t FlightRecorder::CsumLoggingCounter;
|
||||||
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
|
|||||||
CsumLoggingCounter=0;
|
CsumLoggingCounter=0;
|
||||||
NormLoggingCounter=0;
|
NormLoggingCounter=0;
|
||||||
ReductionLoggingCounter=0;
|
ReductionLoggingCounter=0;
|
||||||
|
StepName = "No steps started";
|
||||||
|
StepLoggingCounter=0;
|
||||||
}
|
}
|
||||||
void FlightRecorder::Truncate(void)
|
void FlightRecorder::Truncate(void)
|
||||||
{
|
{
|
||||||
@ -88,6 +92,11 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
bool FlightRecorder::StepLog(const char *name)
|
||||||
|
{
|
||||||
|
StepName = name;
|
||||||
|
StepLoggingCounter ++;
|
||||||
|
}
|
||||||
|
|
||||||
void FlightRecorder::SetLoggingModePrint(void)
|
void FlightRecorder::SetLoggingModePrint(void)
|
||||||
{
|
{
|
||||||
@ -111,17 +120,19 @@ uint64_t FlightRecorder::ErrorCount(void)
|
|||||||
{
|
{
|
||||||
return ErrorCounter;
|
return ErrorCounter;
|
||||||
}
|
}
|
||||||
void FlightRecorder::NormLog(double value)
|
bool FlightRecorder::NormLog(double value)
|
||||||
{
|
{
|
||||||
uint64_t hex = * ( (uint64_t *)&value );
|
uint64_t hex = * ( (uint64_t *)&value );
|
||||||
if(LoggingMode == LoggingModePrint) {
|
if(LoggingMode == LoggingModePrint) {
|
||||||
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
if(LoggingMode == LoggingModeRecord) {
|
if(LoggingMode == LoggingModeRecord) {
|
||||||
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
NormLogVector.push_back(value);
|
NormLogVector.push_back(value);
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
|
|
||||||
@ -130,6 +141,9 @@ void FlightRecorder::NormLog(double value)
|
|||||||
|
|
||||||
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
|
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
|
||||||
|
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
|
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
|
||||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
|
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
|
||||||
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
|
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
|
||||||
@ -142,7 +156,9 @@ void FlightRecorder::NormLog(double value)
|
|||||||
NormLoggingCounter,NormLogVector.size(),
|
NormLoggingCounter,NormLogVector.size(),
|
||||||
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
|
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
|
||||||
|
|
||||||
if(!ContinueOnFail)assert(0); // Force takedown of job
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
|
if(!ContinueOnFail) return false;
|
||||||
|
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
} else {
|
} else {
|
||||||
@ -159,18 +175,21 @@ void FlightRecorder::NormLog(double value)
|
|||||||
}
|
}
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
void FlightRecorder::CsumLog(uint64_t hex)
|
bool FlightRecorder::CsumLog(uint64_t hex)
|
||||||
{
|
{
|
||||||
if(LoggingMode == LoggingModePrint) {
|
if(LoggingMode == LoggingModePrint) {
|
||||||
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(LoggingMode == LoggingModeRecord) {
|
if(LoggingMode == LoggingModeRecord) {
|
||||||
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
CsumLogVector.push_back(hex);
|
CsumLogVector.push_back(hex);
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
@ -181,6 +200,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
|
|
||||||
if ( hex != hexref ) {
|
if ( hex != hexref ) {
|
||||||
|
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
|
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
|
||||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
|
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
|
||||||
|
|
||||||
@ -188,9 +210,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
CsumLoggingCounter,hex, hexref);
|
CsumLoggingCounter,hex, hexref);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
if(!ContinueOnFail) assert(0); // Force takedown of job
|
if(!ContinueOnFail) return false;
|
||||||
|
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
|
|
||||||
@ -207,7 +230,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
}
|
}
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FlightRecorder::ReductionLog(double local,double global)
|
void FlightRecorder::ReductionLog(double local,double global)
|
||||||
{
|
{
|
||||||
uint64_t hex_l = * ( (uint64_t *)&local );
|
uint64_t hex_l = * ( (uint64_t *)&local );
|
||||||
@ -224,11 +249,15 @@ void FlightRecorder::ReductionLog(double local,double global)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(ReductionLoggingCounter < ReductionLogVector.size()){
|
if(ReductionLoggingCounter < ReductionLogVector.size()){
|
||||||
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
|
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
|
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
ReductionLoggingCounter,ReductionLogVector.size(),
|
ReductionLoggingCounter,ReductionLogVector.size(),
|
||||||
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
|
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
@ -267,11 +296,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(XmitLoggingCounter < XmitLogVector.size()){
|
if(XmitLoggingCounter < XmitLogVector.size()){
|
||||||
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
|
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
|
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
XmitLoggingCounter,XmitLogVector.size(),
|
XmitLoggingCounter,XmitLogVector.size(),
|
||||||
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
|
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
@ -309,11 +342,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(RecvLoggingCounter < RecvLogVector.size()){
|
if(RecvLoggingCounter < RecvLogVector.size()){
|
||||||
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
|
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
|
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
RecvLoggingCounter,RecvLogVector.size(),
|
RecvLoggingCounter,RecvLogVector.size(),
|
||||||
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
|
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
|
@ -12,6 +12,8 @@ class FlightRecorder {
|
|||||||
|
|
||||||
static int LoggingMode;
|
static int LoggingMode;
|
||||||
static uint64_t ErrorCounter;
|
static uint64_t ErrorCounter;
|
||||||
|
static const char * StepName;
|
||||||
|
static int32_t StepLoggingCounter;
|
||||||
static int32_t XmitLoggingCounter;
|
static int32_t XmitLoggingCounter;
|
||||||
static int32_t RecvLoggingCounter;
|
static int32_t RecvLoggingCounter;
|
||||||
static int32_t CsumLoggingCounter;
|
static int32_t CsumLoggingCounter;
|
||||||
@ -30,8 +32,9 @@ class FlightRecorder {
|
|||||||
static void SetLoggingModeRecord(void);
|
static void SetLoggingModeRecord(void);
|
||||||
static void SetLoggingModeVerify(void);
|
static void SetLoggingModeVerify(void);
|
||||||
static void SetLoggingMode(LoggingMode_t mode);
|
static void SetLoggingMode(LoggingMode_t mode);
|
||||||
static void NormLog(double value);
|
static bool StepLog(const char *name);
|
||||||
static void CsumLog(uint64_t csum);
|
static bool NormLog(double value);
|
||||||
|
static bool CsumLog(uint64_t csum);
|
||||||
static void ReductionLog(double lcl, double glbl);
|
static void ReductionLog(double lcl, double glbl);
|
||||||
static void Truncate(void);
|
static void Truncate(void);
|
||||||
static void ResetCounters(void);
|
static void ResetCounters(void);
|
||||||
|
@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
|
|||||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
{
|
{
|
||||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||||
|
fprintf(stderr,"FlightRecorder step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
fprintf(stderr," code %d\n",si->si_code);
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
|
Loading…
Reference in New Issue
Block a user