1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-15 02:05:37 +00:00

Better flight logging

This commit is contained in:
Peter Boyle 2024-10-10 22:01:57 +00:00
parent beb0e474ee
commit ec1395a304
3 changed files with 49 additions and 6 deletions

View File

@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode; int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms; int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend; int FlightRecorder::ChecksumCommsSend;
const char * FlightRecorder::StepName;
int32_t FlightRecorder::StepLoggingCounter;
int32_t FlightRecorder::XmitLoggingCounter; int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter; int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter; int32_t FlightRecorder::CsumLoggingCounter;
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
CsumLoggingCounter=0; CsumLoggingCounter=0;
NormLoggingCounter=0; NormLoggingCounter=0;
ReductionLoggingCounter=0; ReductionLoggingCounter=0;
StepName = "No steps started";
StepLoggingCounter=0;
} }
void FlightRecorder::Truncate(void) void FlightRecorder::Truncate(void)
{ {
@ -88,6 +92,11 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
assert(0); assert(0);
} }
} }
// Record the name of the step now being entered and bump the step counter.
// StepName / StepLoggingCounter are printed alongside verify-mode mismatch
// messages and from the user signal handler to show where a failure occurred.
//
// name : label for the step. Only the pointer is stored (no copy is made),
//        so the string must remain valid for as long as it may be reported;
//        string literals are the safe choice.
// Returns true unconditionally. The function is declared bool, and flowing
// off the end of a non-void function is undefined behaviour, so an explicit
// return is required.
bool FlightRecorder::StepLog(const char *name)
{
  StepName = name;
  StepLoggingCounter++;
  return true;
}
void FlightRecorder::SetLoggingModePrint(void) void FlightRecorder::SetLoggingModePrint(void)
{ {
@ -111,17 +120,19 @@ uint64_t FlightRecorder::ErrorCount(void)
{ {
return ErrorCounter; return ErrorCounter;
} }
void FlightRecorder::NormLog(double value) bool FlightRecorder::NormLog(double value)
{ {
uint64_t hex = * ( (uint64_t *)&value ); uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value); NormLogVector.push_back(value);
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -130,6 +141,9 @@ void FlightRecorder::NormLog(double value)
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -142,7 +156,9 @@ void FlightRecorder::NormLog(double value)
NormLoggingCounter,NormLogVector.size(), NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr); value, NormLogVector[NormLoggingCounter]); fflush(stderr);
if(!ContinueOnFail)assert(0); // Force takedown of job BACKTRACEFP(stderr);
if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
} else { } else {
@ -159,18 +175,21 @@ void FlightRecorder::NormLog(double value)
} }
NormLoggingCounter++; NormLoggingCounter++;
} }
return true;
} }
void FlightRecorder::CsumLog(uint64_t hex) bool FlightRecorder::CsumLog(uint64_t hex)
{ {
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex); CsumLogVector.push_back(hex);
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -181,6 +200,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
if ( hex != hexref ) { if ( hex != hexref ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
@ -188,9 +210,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref); CsumLoggingCounter,hex, hexref);
BACKTRACEFP(stderr);
fflush(stderr); fflush(stderr);
if(!ContinueOnFail) assert(0); // Force takedown of job if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
@ -207,7 +230,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
} }
CsumLoggingCounter++; CsumLoggingCounter++;
} }
return true;
} }
void FlightRecorder::ReductionLog(double local,double global) void FlightRecorder::ReductionLog(double local,double global)
{ {
uint64_t hex_l = * ( (uint64_t *)&local ); uint64_t hex_l = * ( (uint64_t *)&local );
@ -224,11 +249,15 @@ void FlightRecorder::ReductionLog(double local,double global)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){ if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) { if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(), ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -267,11 +296,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){ if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) { if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n", fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(), XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -309,11 +342,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){ if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) { if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n", fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(), RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);

View File

@ -12,6 +12,8 @@ class FlightRecorder {
static int LoggingMode; static int LoggingMode;
static uint64_t ErrorCounter; static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
static int32_t XmitLoggingCounter; static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter; static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter; static int32_t CsumLoggingCounter;
@ -30,8 +32,9 @@ class FlightRecorder {
static void SetLoggingModeRecord(void); static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void); static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode); static void SetLoggingMode(LoggingMode_t mode);
static void NormLog(double value); static bool StepLog(const char *name);
static void CsumLog(uint64_t csum); static bool NormLog(double value);
static bool CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl); static void ReductionLog(double lcl, double glbl);
static void Truncate(void); static void Truncate(void);
static void ResetCounters(void); static void ResetCounters(void);

View File

@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname); fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code); fprintf(stderr," code %d\n",si->si_code);