1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-12-04 13:24:40 +00:00

Compare commits

..

5 Commits

Author SHA1 Message Date
7132a4fd28 Update 2025-12-02 23:22:42 +00:00
e8057d6b4a Updated for verbose on host vs. device side csum 2025-12-02 23:15:32 +00:00
973584e039 Update on aurora 2025-12-02 22:24:54 +00:00
Peter Boyle
ea46c2dc3c config command 2025-12-02 16:01:37 -05:00
Peter Boyle
50bcd76fc1 Changes to help with error logging on aurora -- triage MPI / Slingshot vs. host-device / SYCL on checksum error 2025-12-02 15:51:29 -05:00
8 changed files with 132 additions and 7 deletions

View File

@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.dest = from;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.dest = from;
srq.commdir = dir;
list.push_back(srq);
}
@@ -817,12 +818,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
// Get global error count in comms receive
#define AUDIT_FLIGHT_RECORDER_ERRORS
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
uint64_t EC = FlightRecorder::CommsErrorCount();
if (EC) std::cerr << " global sum error count "<<EC<<std::endl;
this->GlobalSum(EC);
if (EC) {
for(int r=0;r<list.size();r++){
#ifdef GRID_CHECKSUM_COMMS
uint64_t rbytes_data = list[r].bytes - 8;
#else
uint64_t rbytes_data = list[r].bytes;
#endif
if (list[r].PacketType == InterNodeReceiveHtoD) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
} if (list[r].PacketType == InterNodeXmitISend ) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
}
}
}
#endif
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {

View File

@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
Word csum=0;
for(uint64_t w=0;w<L;w++){
csum = csum ^ vec[w];
}
return csum;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
gpu_xor(vec,L);
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;

View File

@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
uint64_t FlightRecorder::CommsErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
ResetCounters();
LoggingMode = LoggingModeVerify;
}
uint64_t FlightRecorder::CommsErrorCount(void)
{
return CommsErrorCounter;
}
uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if ( !ContinueOnFail ) GRID_ASSERT(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if ( !ContinueOnFail ) GRID_ASSERT(0);
CommsErrorCounter++;
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;

View File

@@ -9,8 +9,8 @@ class FlightRecorder {
LoggingModeRecord,
LoggingModeVerify
};
static int LoggingMode;
static uint64_t CommsErrorCounter;
static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
@@ -39,6 +39,7 @@ class FlightRecorder {
static void Truncate(void);
static void ResetCounters(void);
static uint64_t ErrorCount(void);
static uint64_t CommsErrorCount(void);
static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank);
};

View File

@@ -51,7 +51,7 @@ EOF
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message "
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message --debug-stdout "
echo $CMD > command-line
env > environment

View File

@@ -0,0 +1,62 @@
#!/bin/bash
#PBS -l select=256
#PBS -q run_next
##PBS -A LatticeQCD_aesp_CNDA
#PBS -A LatticeFlavor
#PBS -l walltime=06:00:00
#PBS -N reproBigJob
#PBS -k doe
#PBS -l filesystems=flare
#PBS -l filesystems=home
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=1
export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
export MPIR_CVAR_NOLOCAL=1
cd $PBS_O_WORKDIR
source ../sourceme.sh
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR
cd $DIR
cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -HUP Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 3072 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \
$BINARY --mpi 8.8.8.6 --grid 128.128.128.288 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --log Message --debug-stdout "
echo $CMD > command-line
env > environment
$CMD
grep Oops */Grid.stderr.* > failures.$PBS_JOBID

View File

@@ -0,0 +1,12 @@
CXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ ../../configure \
--enable-simd=GEN \
--enable-comms=mpi3 \
--enable-debug \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/BugHunt/install \
--with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
--with-openssl=$BREW \
--disable-fermion-reps \
--disable-gparity \
--enable-debug

View File

@@ -82,6 +82,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
std::cout << GridLogMessage<<" in main(): Grid is initialised"<<std::endl;
const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
@@ -94,6 +95,7 @@ int main (int argc, char ** argv)
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
std::cout << GridLogMessage<<" in main(): making RNGs"<<std::endl;
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
@@ -152,7 +154,7 @@ int main (int argc, char ** argv)
time_t start = time(NULL);
UGrid->Broadcast(0,(void *)&start,sizeof(start));
FlightRecorder::ContinueOnFail = 0;
FlightRecorder::ContinueOnFail = 1;
FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0;