mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Merge branch 'feature/resilient-io' into develop
This commit is contained in:
commit
c509bd3fe2
3
Grid/parallelIO/BinaryIO.cc
Normal file
3
Grid/parallelIO/BinaryIO.cc
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
|
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class BinaryIO {
|
class BinaryIO {
|
||||||
public:
|
public:
|
||||||
|
static int latticeWriteMaxRetry;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
// more byte manipulation helpers
|
// more byte manipulation helpers
|
||||||
@ -370,7 +371,7 @@ PARALLEL_CRITICAL
|
|||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
|
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
|
||||||
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
|
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
|
||||||
std::ifstream fin;
|
std::ifstream fin;
|
||||||
fin.open(file, std::ios::binary | std::ios::in);
|
fin.open(file, std::ios::binary | std::ios::in);
|
||||||
if (control & BINARYIO_MASTER_APPEND)
|
if (control & BINARYIO_MASTER_APPEND)
|
||||||
@ -582,7 +583,9 @@ PARALLEL_CRITICAL
|
|||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::Realified::scalar_type word; word w=0;
|
typedef typename vobj::Realified::scalar_type word; word w=0;
|
||||||
GridBase *grid = Umu._grid;
|
GridBase *grid = Umu._grid;
|
||||||
uint64_t lsites = grid->lSites();
|
uint64_t lsites = grid->lSites(), offsetCopy = offset;
|
||||||
|
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
|
||||||
|
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
|
||||||
|
|
||||||
std::vector<sobj> scalardata(lsites);
|
std::vector<sobj> scalardata(lsites);
|
||||||
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
|
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
|
||||||
@ -597,9 +600,35 @@ PARALLEL_CRITICAL
|
|||||||
|
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
while (attemptsLeft >= 0)
|
||||||
|
{
|
||||||
|
grid->Barrier();
|
||||||
|
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
||||||
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
if (checkWrite)
|
||||||
|
{
|
||||||
|
std::vector<fobj> ckiodata(lsites);
|
||||||
|
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
|
||||||
|
uint64_t ckoffset = offsetCopy;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
|
||||||
|
grid->Barrier();
|
||||||
|
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
|
||||||
|
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
|
||||||
|
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
|
||||||
|
offset = offsetCopy;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
attemptsLeft--;
|
||||||
|
}
|
||||||
|
|
||||||
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
|
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
|
||||||
}
|
}
|
||||||
@ -725,5 +754,6 @@ PARALLEL_CRITICAL
|
|||||||
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
|
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -108,6 +108,9 @@ void Application::run(void)
|
|||||||
HADRONS_ERROR(Definition, "run id is empty");
|
HADRONS_ERROR(Definition, "run id is empty");
|
||||||
}
|
}
|
||||||
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
|
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
|
||||||
|
BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
|
||||||
|
LOG(Message) << "Attempt(s) for resilient parallel I/O: "
|
||||||
|
<< BinaryIO::latticeWriteMaxRetry << std::endl;
|
||||||
vm().setRunId(getPar().runId);
|
vm().setRunId(getPar().runId);
|
||||||
vm().printContent();
|
vm().printContent();
|
||||||
env().printContent();
|
env().printContent();
|
||||||
|
@ -56,7 +56,9 @@ public:
|
|||||||
TrajRange, trajCounter,
|
TrajRange, trajCounter,
|
||||||
VirtualMachine::GeneticPar, genetic,
|
VirtualMachine::GeneticPar, genetic,
|
||||||
std::string, runId,
|
std::string, runId,
|
||||||
std::string, graphFile);
|
std::string, graphFile,
|
||||||
|
int, parallelWriteMaxRetry);
|
||||||
|
GlobalPar(void): parallelWriteMaxRetry{-1} {}
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
// constructors
|
// constructors
|
||||||
|
Loading…
Reference in New Issue
Block a user