From 136d3802cbea8f74f725aab337660ca3c47603a9 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 27 Nov 2018 18:38:24 +0000 Subject: [PATCH] binary parallel IO can do read tests and eventually re-write in case of failure --- Grid/parallelIO/BinaryIO.cc | 3 +++ Grid/parallelIO/BinaryIO.h | 28 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 Grid/parallelIO/BinaryIO.cc diff --git a/Grid/parallelIO/BinaryIO.cc b/Grid/parallelIO/BinaryIO.cc new file mode 100644 index 00000000..221a7fe8 --- /dev/null +++ b/Grid/parallelIO/BinaryIO.cc @@ -0,0 +1,3 @@ +#include + +int Grid::BinaryIO::latticeWriteMaxRetry = -1; diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index a60fe962..ac82af11 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key) /////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { public: + static int latticeWriteMaxRetry; ///////////////////////////////////////////////////////////////////////////// // more byte manipulation helpers @@ -583,6 +584,8 @@ PARALLEL_CRITICAL typedef typename vobj::Realified::scalar_type word; word w=0; GridBase *grid = Umu._grid; uint64_t lsites = grid->lSites(); + int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry); + bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0); std::vector scalardata(lsites); std::vector iodata(lsites); // Munge, checksum, byte order in here @@ -598,8 +601,28 @@ PARALLEL_CRITICAL grid->Barrier(); timer.Stop(); - IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, - nersc_csum,scidac_csuma,scidac_csumb); + while (attemptsLeft >= 0) + { + grid->Barrier(); + IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, + nersc_csum,scidac_csuma,scidac_csumb); + if (checkWrite) + { + std::vector ckiodata(lsites); + uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb; + + std::cout << GridLogMessage << "writeLatticeObject: read back object to check" << std::endl; + grid->Barrier(); + IOobject(w,grid,ckiodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC, + cknersc_csum,ckscidac_csuma,ckscidac_csumb); + if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb)) + { + std::cout << GridLogMessage << "writeLatticeObject: checksum failure in test read (" << attemptsLeft << " write attempt(s) remaining)" << std::endl; + } + } + attemptsLeft--; + } + std::cout<