1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-14 01:35:36 +00:00

Merge branch 'feature/resilient-io' into develop

This commit is contained in:
Antonin Portelli 2018-12-01 12:57:43 +00:00
commit c509bd3fe2
4 changed files with 43 additions and 5 deletions

View File

@ -0,0 +1,3 @@
#include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1;

View File

@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers // more byte manipulation helpers
@ -370,7 +371,7 @@ PARALLEL_CRITICAL
#endif #endif
} else { } else {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : " std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl; << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ifstream fin; std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in); fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND) if (control & BINARYIO_MASTER_APPEND)
@ -582,7 +583,9 @@ PARALLEL_CRITICAL
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -597,9 +600,35 @@ PARALLEL_CRITICAL
grid->Barrier(); grid->Barrier();
timer.Stop(); timer.Stop();
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
std::vector<fobj> ckiodata(lsites);
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
uint64_t ckoffset = offsetCopy;
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy;
}
else
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
break;
}
}
attemptsLeft--;
}
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
} }
@ -725,5 +754,6 @@ PARALLEL_CRITICAL
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
} }
}; };
} }
#endif #endif

View File

@ -108,6 +108,9 @@ void Application::run(void)
HADRONS_ERROR(Definition, "run id is empty"); HADRONS_ERROR(Definition, "run id is empty");
} }
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl; LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
LOG(Message) << "Attempt(s) for resilient parallel I/O: "
<< BinaryIO::latticeWriteMaxRetry << std::endl;
vm().setRunId(getPar().runId); vm().setRunId(getPar().runId);
vm().printContent(); vm().printContent();
env().printContent(); env().printContent();

View File

@ -56,7 +56,9 @@ public:
TrajRange, trajCounter, TrajRange, trajCounter,
VirtualMachine::GeneticPar, genetic, VirtualMachine::GeneticPar, genetic,
std::string, runId, std::string, runId,
std::string, graphFile); std::string, graphFile,
int, parallelWriteMaxRetry);
GlobalPar(void): parallelWriteMaxRetry{-1} {}
}; };
public: public:
// constructors // constructors