mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Flight recorder, resurrecting the "world famous" Britney test
This commit is contained in:
parent
b92dfcc8d3
commit
60b7f6c99d
@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid);
|
||||
// File-scope Coordinate objects with internal linkage (static).
// NOTE(review): presumably the default lattice dimensions and default MPI
// processor layout - confirm against the code that fills them in.
static Coordinate Grid_default_latt;
|
||||
static Coordinate Grid_default_mpi;
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
// Grid Norm logging for repro testing
|
||||
///////////////////////////////////////////////////////
|
||||
// Currently selected logging mode; the logging routines compare it against the
// GridNormLoggingMode{Print,Record,Verify,None} values.
int GridNormLoggingMode;
|
||||
// Number of GridNormLog() entries handled since the mode was last set; in
// verify mode it is the index of the recorded entry to compare against.
int32_t GridNormLoggingCounter;
|
||||
// Same role as GridNormLoggingCounter, but for GridMPINormLog().
int32_t GridMPINormLoggingCounter;
|
||||
// Norm values appended by GridNormLog() in record mode.
std::vector<double> GridNormLogVector;
|
||||
// Reduced ("result") values appended by GridMPINormLog() in record mode.
std::vector<double> GridMPINormLogVector;
|
||||
// Checksums appended by GridNormLog() in record mode, one per entry of GridNormLogVector.
std::vector<uint32_t> GridCsumLogVector;
|
||||
|
||||
// Select the norm-logging mode.
//
// Print, Record and Verify are forwarded to their dedicated setters.
// None stores the mode, zeroes both counters and empties all three logs.
// Any other value trips assert(0).
void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
{
  if ( mode == GridNormLoggingModePrint ) {
    SetGridNormLoggingModePrint();
    return;
  }
  if ( mode == GridNormLoggingModeRecord ) {
    SetGridNormLoggingModeRecord();
    return;
  }
  if ( mode == GridNormLoggingModeVerify ) {
    SetGridNormLoggingModeVerify();
    return;
  }
  if ( mode == GridNormLoggingModeNone ) {
    // Switching logging off also discards any recorded history.
    GridNormLogVector.clear();
    GridCsumLogVector.clear();
    GridMPINormLogVector.clear();
    GridNormLoggingCounter    = 0;
    GridMPINormLoggingCounter = 0;
    GridNormLoggingMode       = mode;
    return;
  }
  // Not one of the known modes.
  assert(0);
}
|
||||
|
||||
void SetGridNormLoggingModePrint(void)
|
||||
{
|
||||
std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl;
|
||||
GridNormLoggingCounter = 0;
|
||||
GridMPINormLoggingCounter=0;
|
||||
GridNormLogVector.resize(0);
|
||||
GridCsumLogVector.resize(0);
|
||||
GridMPINormLogVector.resize(0);
|
||||
GridNormLoggingMode = GridNormLoggingModePrint;
|
||||
}
|
||||
void SetGridNormLoggingModeRecord(void)
|
||||
{
|
||||
std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl;
|
||||
GridNormLoggingCounter = 0;
|
||||
GridMPINormLoggingCounter=0;
|
||||
GridNormLogVector.resize(0);
|
||||
GridCsumLogVector.resize(0);
|
||||
GridMPINormLogVector.resize(0);
|
||||
GridNormLoggingMode = GridNormLoggingModeRecord;
|
||||
}
|
||||
void SetGridNormLoggingModeVerify(void)
|
||||
{
|
||||
std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl;
|
||||
GridNormLoggingCounter = 0;
|
||||
GridMPINormLoggingCounter=0;
|
||||
GridNormLoggingMode = GridNormLoggingModeVerify;
|
||||
}
|
||||
void GridNormLog(double value,uint32_t csum)
|
||||
{
|
||||
if(GridNormLoggingMode == GridNormLoggingModePrint) {
|
||||
std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl;
|
||||
GridNormLoggingCounter++;
|
||||
}
|
||||
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
|
||||
GridNormLogVector.push_back(value);
|
||||
GridCsumLogVector.push_back(csum);
|
||||
GridNormLoggingCounter++;
|
||||
}
|
||||
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
|
||||
assert(GridNormLoggingCounter < GridNormLogVector.size());
|
||||
if ( (value != GridNormLogVector[GridNormLoggingCounter])
|
||||
|| (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) {
|
||||
std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl;
|
||||
std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl;
|
||||
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n",
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
GridNormLoggingCounter,GridNormLogVector.size(),
|
||||
value, GridNormLogVector[GridNormLoggingCounter],
|
||||
csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr);
|
||||
assert(0); // Force takedown of job
|
||||
}
|
||||
if ( GridNormLogVector.size()==GridNormLoggingCounter ) {
|
||||
std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl;
|
||||
}
|
||||
GridNormLoggingCounter++;
|
||||
}
|
||||
}
|
||||
void GridMPINormLog(double local,double result)
|
||||
{
|
||||
if(GridNormLoggingMode == GridNormLoggingModePrint) {
|
||||
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl;
|
||||
GridMPINormLoggingCounter++;
|
||||
}
|
||||
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
|
||||
std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
|
||||
GridMPINormLogVector.push_back(result);
|
||||
GridMPINormLoggingCounter++;
|
||||
}
|
||||
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
|
||||
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
|
||||
assert(GridMPINormLoggingCounter < GridMPINormLogVector.size());
|
||||
if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) {
|
||||
fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n",
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
GridMPINormLoggingCounter,GridMPINormLogVector.size(),
|
||||
result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr);
|
||||
assert(0); // Force takedown of job
|
||||
}
|
||||
if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) {
|
||||
std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl;
|
||||
}
|
||||
GridMPINormLoggingCounter++;
|
||||
}
|
||||
}
|
||||
|
||||
// Out-of-class definitions of GridThread's static data members, each
// initialised to 1.
int GridThread::_threads =1;
|
||||
int GridThread::_hyperthreads=1;
|
||||
int GridThread::_cores=1;
|
||||
|
Loading…
Reference in New Issue
Block a user