From 10fc263675d29f26108143c9be1dba0ed547f435 Mon Sep 17 00:00:00 2001
From: Antonin Portelli
Date: Mon, 26 Nov 2018 19:47:03 +0000
Subject: [PATCH] Hadrons: A2A vector write can fail and retry

---
Review notes (this section is ignored by git am):
 * A2AVectorsIo::write(): the read-back check is now skipped for every
   negative maxRetry (`maxRetry < 0`). The previous test `maxRetry < -1`
   let the documented default of -1 fall through to the read-back branch,
   so "don't read back to check" was never honoured for the default value.
 * NOTE(review): the ScidacReader used for the read-back is opened on every
   attempt but never closed in this patch. If ScidacReader does not release
   the file in its destructor this leaks one handle per attempt -- confirm
   against the reader class and add a close() call if needed.
 * NOTE(review): std::max is used; confirm <algorithm> is reachable from
   Hadrons/A2AVectors.hpp.
 * Line breaks and indentation of this patch were reconstructed from a
   whitespace-collapsed copy; apply with whitespace checks relaxed.
   `<typename Field>` / `std::vector<Field>` were restored on the
   A2AVectorsIo::write/read lines (inferred from `Field buf(grid)`; the
   `typename` keyword is assumed). The bare `template` context line in
   Hadrons/Modules/MSolver/A2AVectors.hpp and the function names after the
   `@@` markers still lack their template arguments -- confirm against the
   tree before applying.

 Hadrons/A2AVectors.hpp                 | 97 +++++++++++++++++++++-----
 Hadrons/Modules/MSolver/A2AVectors.hpp | 30 +++++---
 2 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/Hadrons/A2AVectors.hpp b/Hadrons/A2AVectors.hpp
index f55eb6d7..52b256ee 100644
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@@ -83,9 +83,14 @@ public:
         Record(void): index(0) {}
     };
 public:
+    // maxRetry meaning:
+    // < 0: don't read back to check (default is -1)
+    //   0: read to check, and crash (assert) in case of failure
+    // n > 0: read to check, retry to write n times before crashing
     template <typename Field>
     static void write(const std::string fileStem, std::vector<Field> &vec,
-                      const bool multiFile, const int trajectory = -1);
+                      const bool multiFile, const int trajectory = -1,
+                      const int maxRetry = -1);
     template <typename Field>
     static void read(std::vector<Field> &vec, const std::string fileStem,
                      const bool multiFile, const int trajectory = -1);
@@ -258,12 +263,13 @@ void A2AVectorsSchurDiagTwo::makeHighModeW5D(FermionField &wout_4d,
  ******************************************************************************/
 template <typename Field>
 void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec,
-                         const bool multiFile, const int trajectory)
+                         const bool multiFile, const int trajectory, const int maxRetry)
 {
     Record       record;
     GridBase     *grid = vec[0]._grid;
     ScidacWriter binWriter(grid->IsBoss());
     std::string  filename = vecFilename(fileStem, trajectory, multiFile);
+    Field        buf(grid); // scratch field receiving the read-back records
 
     if (multiFile)
     {
@@ -271,27 +277,86 @@ void A2AVectorsIo::write(const std::string fileStem, std::vector &vec,
 
         for (unsigned int i = 0; i < vec.size(); ++i)
         {
-            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+            int status = GridLimeReader::LIME_READ_FAILURE, attempt = std::max(0, maxRetry);
 
-            LOG(Message) << "Writing vector " << i << std::endl;
-            makeFileDir(fullFilename, grid);
-            binWriter.open(fullFilename);
-            record.index = i;
-            binWriter.writeScidacFieldRecord(vec[i], record);
-            binWriter.close();
+            while ((status != GridLimeReader::LIME_READ_SUCCESS) and (attempt >= 0)) // rewrite until verified or out of retries
+            {
+                fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+
+                LOG(Message) << "Writing vector " << i << std::endl;
+                makeFileDir(fullFilename, grid);
+                binWriter.open(fullFilename);
+                record.index = i;
+                binWriter.writeScidacFieldRecord(vec[i], record);
+                binWriter.close();
+                if (maxRetry < 0) // negative maxRetry (default -1): read-back check disabled
+                {
+                    status = GridLimeReader::LIME_READ_SUCCESS;
+                }
+                else if (attempt >= 0)
+                {
+                    ScidacReader binReader;
+
+                    LOG(Message) << "Reading back vector " << i
+                                 << " (" << attempt << " attempt(s) left)" << std::endl;
+                    binReader.open(fullFilename);
+                    status = binReader.readScidacFieldRecord(buf, record, false);
+                    if (status != GridLimeReader::LIME_READ_SUCCESS)
+                    {
+                        LOG(Message) << "Read failure" << std::endl;
+                    }
+                    attempt--; // one write/verify attempt consumed
+                }
+            }
+            if (status != GridLimeReader::LIME_READ_SUCCESS) // every attempt failed verification
+            {
+                HADRONS_ERROR(Io, "I/O error while writing vector " + std::to_string(i));
+            }
         }
     }
     else
     {
-        makeFileDir(filename, grid);
-        binWriter.open(filename);
-        for (unsigned int i = 0; i < vec.size(); ++i)
+        int status = GridLimeReader::LIME_READ_FAILURE, attempt = std::max(0, maxRetry);
+
+        while ((status != GridLimeReader::LIME_READ_SUCCESS) and (attempt >= 0)) // rewrite the whole set until verified or out of retries
         {
-            LOG(Message) << "Writing vector " << i << std::endl;
-            record.index = i;
-            binWriter.writeScidacFieldRecord(vec[i], record);
+            makeFileDir(filename, grid);
+            binWriter.open(filename);
+            for (unsigned int i = 0; i < vec.size(); ++i)
+            {
+                LOG(Message) << "Writing vector " << i << std::endl;
+                record.index = i;
+                binWriter.writeScidacFieldRecord(vec[i], record);
+            }
+            binWriter.close();
+            if (maxRetry < 0) // negative maxRetry (default -1): read-back check disabled
+            {
+                status = GridLimeReader::LIME_READ_SUCCESS;
+            }
+            else if (attempt >= 0)
+            {
+                ScidacReader binReader;
+
+                binReader.open(filename);
+                LOG(Message) << "Reading back vector set ("
+                             << attempt << " attempt(s) left)" << std::endl;
+                for (unsigned int i = 0; i < vec.size(); ++i)
+                {
+                    LOG(Message) << "Reading vector " << i << std::endl;
+                    status = binReader.readScidacFieldRecord(buf, record, false);
+                    if (status != GridLimeReader::LIME_READ_SUCCESS)
+                    {
+                        LOG(Message) << "Read failure" << std::endl;
+                        break; // first bad record invalidates the whole file
+                    }
+                }
+                attempt--; // one write/verify attempt consumed
+            }
+        }
+        if (status != GridLimeReader::LIME_READ_SUCCESS) // every attempt failed verification
+        {
+            HADRONS_ERROR(Io, "I/O error while writing vector set");
         }
-        binWriter.close();
     }
 }
 
diff --git a/Hadrons/Modules/MSolver/A2AVectors.hpp b/Hadrons/Modules/MSolver/A2AVectors.hpp
index f9980ee3..a0d52c9a 100644
--- a/Hadrons/Modules/MSolver/A2AVectors.hpp
+++ b/Hadrons/Modules/MSolver/A2AVectors.hpp
@@ -44,16 +44,24 @@ BEGIN_HADRONS_NAMESPACE
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MSolver)
 
+class A2AVectorsIoPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AVectorsIoPar,
+                                    std::string, filestem,
+                                    bool,        multiFile,
+                                    int,         maxRetry);
+};
+
 class A2AVectorsPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(A2AVectorsPar,
-                                    std::string, noise,
-                                    std::string, action,
-                                    std::string, eigenPack,
-                                    std::string, solver,
-                                    std::string, output,
-                                    bool,        multiFile);
+                                    std::string,     noise,
+                                    std::string,     action,
+                                    std::string,     eigenPack,
+                                    std::string,     solver,
+                                    A2AVectorsIoPar, output);
 };
 
 template
@@ -240,13 +248,17 @@ void TA2AVectors::execute(void)
     }
 
     // I/O if necessary
-    if (!par().output.empty())
+    if (!par().output.filestem.empty())
     {
         startTimer("V I/O");
-        A2AVectorsIo::write(par().output + "_v", v, par().multiFile, vm().getTrajectory());
+        A2AVectorsIo::write(par().output.filestem + "_v", v,
+                            par().output.multiFile, vm().getTrajectory(),
+                            par().output.maxRetry);
         stopTimer("V I/O");
         startTimer("W I/O");
-        A2AVectorsIo::write(par().output + "_w", w, par().multiFile, vm().getTrajectory());
+        A2AVectorsIo::write(par().output.filestem + "_w", w,
+                            par().output.multiFile, vm().getTrajectory(),
+                            par().output.maxRetry);
         stopTimer("W I/O");
     }
 }