1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-12-05 05:44:41 +00:00

Changes to help with error logging on Aurora -- triage MPI / Slingshot vs. host-device / SYCL on checksum error

This commit is contained in:
Peter Boyle
2025-12-02 15:51:29 -05:00
parent 4a0aaf0786
commit 50bcd76fc1
4 changed files with 50 additions and 5 deletions

View File

@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.dest = from;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.dest = from;
srq.commdir = dir;
list.push_back(srq);
}
@@ -823,6 +824,31 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
// Get global error count in comms receive
#define AUDIT_FLIGHT_RECORDER_ERRORS
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
uint64_t EC = FlightRecorder::CommsErrorCount();
this->GlobalSum(EC);
if (EC) {
for(int r=0;r<list.size();r++){
#ifdef GRID_CHECKSUM_COMMS
uint64_t rbytes_data = list[r].bytes - 8;
#else
uint64_t rbytes_data = list[r].bytes;
#endif
if (list[r].PacketType == InterNodeReceiveHtoD) {
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
} if (list[r].PacketType == InterNodeXmitISend ) {
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
}
}
}
#endif
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {

View File

@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
// Host-side XOR reduction: folds the L words starting at vec into a single
// Word with bitwise XOR and returns it. An empty range (L == 0) yields 0.
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
  Word folded=0;
  const Word *const last = vec + L;
  for(const Word *cursor=vec; cursor!=last; ++cursor){
    folded = folded ^ *cursor;
  }
  return folded;
}
// Thin forwarding wrapper that keeps the svm_xor name available: it hands
// vec and L straight to gpu_xor and returns whatever gpu_xor returns.
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
  // The result must be returned explicitly: falling off the end of a
  // function declared to return Word is undefined behaviour, and callers
  // would otherwise receive an indeterminate checksum.
  return gpu_xor(vec,L);
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;

View File

@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
uint64_t FlightRecorder::CommsErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
ResetCounters();
LoggingMode = LoggingModeVerify;
}
// Accessor for the class-wide CommsErrorCounter: returns its current value
// without modifying it. In the visible code the counter is incremented in
// recvLog next to ErrorCounter++.
// NOTE(review): presumably this is the receive-checksum mismatch path — confirm in recvLog.
uint64_t FlightRecorder::CommsErrorCount(void)
{
return CommsErrorCounter;
}
uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if ( !ContinueOnFail ) GRID_ASSERT(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if ( !ContinueOnFail ) GRID_ASSERT(0);
CommsErrorCounter++;
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;

View File

@@ -9,8 +9,8 @@ class FlightRecorder {
LoggingModeRecord,
LoggingModeVerify
};
static int LoggingMode;
static uint64_t CommsErrorCounter;
static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;