mirror of
https://github.com/paboyle/Grid.git
synced 2025-12-05 05:44:41 +00:00
Changes to help with error logging on Aurora -- on a checksum error, triage MPI / Slingshot faults vs. host-device / SYCL faults
This commit is contained in:
@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
||||
srq.PacketType = InterNodeRecv;
|
||||
srq.bytes = rbytes;
|
||||
srq.req = rrq;
|
||||
srq.dest = from;
|
||||
srq.host_buf = host_recv;
|
||||
srq.device_buf = recv;
|
||||
srq.tag = tag;
|
||||
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.dest = from;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
@@ -823,6 +824,31 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
// Get global error count in comms receive
|
||||
#define AUDIT_FLIGHT_RECORDER_ERRORS
|
||||
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
|
||||
uint64_t EC = FlightRecorder::CommsErrorCount();
|
||||
this->GlobalSum(EC);
|
||||
if (EC) {
|
||||
for(int r=0;r<list.size();r++){
|
||||
#ifdef GRID_CHECKSUM_COMMS
|
||||
uint64_t rbytes_data = list[r].bytes - 8;
|
||||
#else
|
||||
uint64_t rbytes_data = list[r].bytes;
|
||||
#endif
|
||||
if (list[r].PacketType == InterNodeReceiveHtoD) {
|
||||
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||
} if (list[r].PacketType == InterNodeXmitISend ) {
|
||||
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GRID_CHECKSUM_COMMS
|
||||
for(int r=0;r<list.size();r++){
|
||||
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
|
||||
|
||||
@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
|
||||
{
|
||||
Word identity; identity=0;
|
||||
Word ret = 0;
|
||||
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
theGridAccelerator->wait();
|
||||
return ret;
|
||||
}
|
||||
// Host-side XOR reduction over a buffer of words.
//   vec : first element of the buffer; it is only read, never written,
//         and is not dereferenced when L == 0.
//   L   : number of Word elements (not bytes) to fold together.
// Returns vec[0] ^ vec[1] ^ ... ^ vec[L-1], or 0 when L is 0.
// The pointee is const-qualified so read-only buffers can be checksummed
// without a cast; Word is still deduced as the unqualified element type,
// so existing callers passing a plain Word* are unaffected.
template<class Word> Word cpu_xor(const Word *vec,uint64_t L)
{
  Word csum=0;
  for(uint64_t w=0;w<L;w++){
    csum = csum ^ vec[w];
  }
  return csum;
}
|
||||
// Forwards to gpu_xor with the same arguments and hands back its result.
//   vec : buffer passed through unchanged to gpu_xor.
//   L   : element count passed through unchanged to gpu_xor.
// The explicit return matters: without it the value computed by gpu_xor is
// discarded and control flows off the end of a non-void function, which is
// undefined behaviour.
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
  return gpu_xor(vec,L);
}
|
||||
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
|
||||
{
|
||||
Word identity; identity=0;
|
||||
|
||||
@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
|
||||
int32_t FlightRecorder::NormLoggingCounter;
|
||||
int32_t FlightRecorder::ReductionLoggingCounter;
|
||||
uint64_t FlightRecorder::ErrorCounter;
|
||||
uint64_t FlightRecorder::CommsErrorCounter;
|
||||
|
||||
std::vector<double> FlightRecorder::NormLogVector;
|
||||
std::vector<double> FlightRecorder::ReductionLogVector;
|
||||
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
|
||||
ResetCounters();
|
||||
LoggingMode = LoggingModeVerify;
|
||||
}
|
||||
uint64_t FlightRecorder::CommsErrorCount(void)
|
||||
{
|
||||
return CommsErrorCounter;
|
||||
}
|
||||
uint64_t FlightRecorder::ErrorCount(void)
|
||||
{
|
||||
return ErrorCounter;
|
||||
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||
|
||||
ErrorCounter++;
|
||||
|
||||
} else {
|
||||
if ( PrintEntireLog ) {
|
||||
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
|
||||
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
||||
|
||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||
|
||||
CommsErrorCounter++;
|
||||
ErrorCounter++;
|
||||
|
||||
} else {
|
||||
if ( PrintEntireLog ) {
|
||||
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
|
||||
|
||||
@@ -9,8 +9,8 @@ class FlightRecorder {
|
||||
LoggingModeRecord,
|
||||
LoggingModeVerify
|
||||
};
|
||||
|
||||
static int LoggingMode;
|
||||
static uint64_t CommsErrorCounter;
|
||||
static uint64_t ErrorCounter;
|
||||
static const char * StepName;
|
||||
static int32_t StepLoggingCounter;
|
||||
|
||||
Reference in New Issue
Block a user