mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Merge branch 'feature/resilient-io' into develop
This commit is contained in:
commit
c509bd3fe2
3
Grid/parallelIO/BinaryIO.cc
Normal file
3
Grid/parallelIO/BinaryIO.cc
Normal file
@ -0,0 +1,3 @@
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
class BinaryIO {
|
||||
public:
|
||||
static int latticeWriteMaxRetry;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// more byte manipulation helpers
|
||||
@ -370,7 +371,7 @@ PARALLEL_CRITICAL
|
||||
#endif
|
||||
} else {
|
||||
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
|
||||
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
|
||||
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
|
||||
std::ifstream fin;
|
||||
fin.open(file, std::ios::binary | std::ios::in);
|
||||
if (control & BINARYIO_MASTER_APPEND)
|
||||
@ -582,7 +583,9 @@ PARALLEL_CRITICAL
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::Realified::scalar_type word; word w=0;
|
||||
GridBase *grid = Umu._grid;
|
||||
uint64_t lsites = grid->lSites();
|
||||
uint64_t lsites = grid->lSites(), offsetCopy = offset;
|
||||
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
|
||||
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
|
||||
|
||||
std::vector<sobj> scalardata(lsites);
|
||||
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
|
||||
@ -597,9 +600,35 @@ PARALLEL_CRITICAL
|
||||
|
||||
grid->Barrier();
|
||||
timer.Stop();
|
||||
while (attemptsLeft >= 0)
|
||||
{
|
||||
grid->Barrier();
|
||||
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
if (checkWrite)
|
||||
{
|
||||
std::vector<fobj> ckiodata(lsites);
|
||||
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
|
||||
uint64_t ckoffset = offsetCopy;
|
||||
|
||||
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
|
||||
grid->Barrier();
|
||||
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
|
||||
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
|
||||
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
|
||||
{
|
||||
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
|
||||
offset = offsetCopy;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
attemptsLeft--;
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
|
||||
}
|
||||
@ -725,5 +754,6 @@ PARALLEL_CRITICAL
|
||||
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -108,6 +108,9 @@ void Application::run(void)
|
||||
HADRONS_ERROR(Definition, "run id is empty");
|
||||
}
|
||||
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
|
||||
BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
|
||||
LOG(Message) << "Attempt(s) for resilient parallel I/O: "
|
||||
<< BinaryIO::latticeWriteMaxRetry << std::endl;
|
||||
vm().setRunId(getPar().runId);
|
||||
vm().printContent();
|
||||
env().printContent();
|
||||
|
@ -56,7 +56,9 @@ public:
|
||||
TrajRange, trajCounter,
|
||||
VirtualMachine::GeneticPar, genetic,
|
||||
std::string, runId,
|
||||
std::string, graphFile);
|
||||
std::string, graphFile,
|
||||
int, parallelWriteMaxRetry);
|
||||
GlobalPar(void): parallelWriteMaxRetry{-1} {}
|
||||
};
|
||||
public:
|
||||
// constructors
|
||||
|
Loading…
Reference in New Issue
Block a user