1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Flight recorder, resurrecting the "world famous" Britney test

This commit is contained in:
Peter Boyle 2024-03-22 15:32:26 +00:00
parent b92dfcc8d3
commit 60b7f6c99d

View File

@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid);
static Coordinate Grid_default_latt;
static Coordinate Grid_default_mpi;
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int GridNormLoggingMode;
int32_t GridNormLoggingCounter;
int32_t GridMPINormLoggingCounter;
std::vector<double> GridNormLogVector;
std::vector<double> GridMPINormLogVector;
std::vector<uint32_t> GridCsumLogVector;
void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
{
switch ( mode ) {
case GridNormLoggingModePrint:
SetGridNormLoggingModePrint();
break;
case GridNormLoggingModeRecord:
SetGridNormLoggingModeRecord();
break;
case GridNormLoggingModeVerify:
SetGridNormLoggingModeVerify();
break;
case GridNormLoggingModeNone:
GridNormLoggingMode = mode;
GridNormLoggingCounter=0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
break;
default:
assert(0);
}
}
void SetGridNormLoggingModePrint(void)
{
std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModePrint;
}
void SetGridNormLoggingModeRecord(void)
{
std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModeRecord;
}
void SetGridNormLoggingModeVerify(void)
{
std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLoggingMode = GridNormLoggingModeVerify;
}
void GridNormLog(double value,uint32_t csum)
{
if(GridNormLoggingMode == GridNormLoggingModePrint) {
std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl;
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
GridNormLogVector.push_back(value);
GridCsumLogVector.push_back(csum);
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
assert(GridNormLoggingCounter < GridNormLogVector.size());
if ( (value != GridNormLogVector[GridNormLoggingCounter])
|| (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) {
std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl;
std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
GridNormLoggingCounter,GridNormLogVector.size(),
value, GridNormLogVector[GridNormLoggingCounter],
csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr);
assert(0); // Force takedown of job
}
if ( GridNormLogVector.size()==GridNormLoggingCounter ) {
std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl;
}
GridNormLoggingCounter++;
}
}
void GridMPINormLog(double local,double result)
{
if(GridNormLoggingMode == GridNormLoggingModePrint) {
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl;
GridMPINormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
GridMPINormLogVector.push_back(result);
GridMPINormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
assert(GridMPINormLoggingCounter < GridMPINormLogVector.size());
if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) {
fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
GridMPINormLoggingCounter,GridMPINormLogVector.size(),
result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr);
assert(0); // Force takedown of job
}
if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) {
std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl;
}
GridMPINormLoggingCounter++;
}
}
int GridThread::_threads =1;
int GridThread::_hyperthreads=1;
int GridThread::_cores=1;