/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/parallelIO/NerscIO.h

    Copyright (C) 2015

    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H

namespace Grid {
namespace QCD {

using namespace Grid;

////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; compute header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
 public:

  ///////////////////////////////////////////////////////////////////////////
  // Generic Grid array parallel I/O.
  //
  // The number of I/O MPI tasks can be varied by selecting which dimensions
  // use parallel I/O and which dimensions use a serial send to a boss task.
  // We can therefore neck down from, say, 1024 nodes (4x4x8x8) to
  // {1,8,32,64,128,256,1024} nodes doing the I/O.
  //
  // This interpolates nicely between all nodes writing their own data, a
  // single boss per time-plane in processor space (as the old UKQCD Fortran
  // code did), and a single node doing all of the I/O.
  //
  // I am not sure the transfer sizes are big enough, and I am not convinced
  // fstream is guaranteed to avoid buffer inconsistencies unless the streambuf
  // size is set to zero. In practice this has worked on 8 tasks (2x1x2x2)
  // writing/cloning NERSC configurations in a MacOS + OpenMPI + Clang
  // environment.
  //
  // It would be VERY easy to switch to pwrite at a later date, and also easy
  // to send x-strips around from each node in order to gather bigger chunks
  // at the syscall level. That pushes us up to roughly 8 x 18*4*8 == 4KB
  // write chunks, and by taking, say, x/y non-parallel we get 16MB contiguous
  // chunks written in multiple 4KB transactions per I/O node for 64^3
  // lattices. I suspect this is fine for system performance.
  ///////////////////////////////////////////////////////////////////////////
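  // A back-of-envelope check of the chunk size quoted above (an editorial note,
  // assuming the 3x3 double-precision format and reading "8x" as an 8-site
  // x-strip): one site carries 4 directions x (3x3 complex = 18 reals) x 8 bytes
  // = 576 bytes, so an 8-site strip is 8 x 576 bytes ~= 4.5KB -- the "circa 4KB"
  // write chunk.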

  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }

  static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
  {
    std::ofstream fout(file,std::ios::out|std::ios::in);
    fout.seekp(0,std::ios::beg);
    dump_meta_data(field, fout);
    field.data_start = fout.tellp();
    return field.data_start;
  }
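  // Editorial note: the writers below call writeHeader twice -- once before the
  // binary payload (to lay down the header and learn the data offset), and once
  // after it (to patch in the checksum computed while writing the payload).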

  // for the header-reader
  static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
  {
    int offset=0;
    std::map<std::string,std::string> header;
    std::string line;

    //////////////////////////////////////////////////
    // read the header
    //////////////////////////////////////////////////
    std::ifstream fin(file);

    getline(fin,line); // read one line and insist it is BEGIN_HEADER
    removeWhitespace(line);
    std::cout << GridLogMessage << "* " << line << std::endl;

    assert(line==std::string("BEGIN_HEADER"));

    do {
      getline(fin,line); // read one line
      std::cout << GridLogMessage << "* "<<line<< std::endl;

      int eq = line.find("=");
      if(eq >0) {
        std::string key=line.substr(0,eq);
        std::string val=line.substr(eq+1);
        removeWhitespace(key);
        removeWhitespace(val);

        header[key] = val;
      }
    } while( line.find("END_HEADER") == std::string::npos );

    field.data_start = fin.tellg();

    //////////////////////////////////////////////////
    // chomp the values
    //////////////////////////////////////////////////
    field.hdr_version    = header["HDR_VERSION"];
    field.data_type      = header["DATATYPE"];
    field.storage_format = header["STORAGE_FORMAT"];

    field.dimension[0] = std::stol(header["DIMENSION_1"]);
    field.dimension[1] = std::stol(header["DIMENSION_2"]);
    field.dimension[2] = std::stol(header["DIMENSION_3"]);
    field.dimension[3] = std::stol(header["DIMENSION_4"]);

    assert(grid->_ndimension == 4);
    for(int d=0;d<4;d++){
      assert(grid->_fdimensions[d]==field.dimension[d]);
    }

    field.link_trace = std::stod(header["LINK_TRACE"]);
    field.plaquette  = std::stod(header["PLAQUETTE"]);

    field.boundary[0] = header["BOUNDARY_1"];
    field.boundary[1] = header["BOUNDARY_2"];
    field.boundary[2] = header["BOUNDARY_3"];
    field.boundary[3] = header["BOUNDARY_4"];

    field.checksum         = std::stoul(header["CHECKSUM"],0,16);
    field.ensemble_id      = header["ENSEMBLE_ID"];
    field.ensemble_label   = header["ENSEMBLE_LABEL"];
    field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
    field.creator          = header["CREATOR"];
    field.creator_hardware = header["CREATOR_HARDWARE"];
    field.creation_date    = header["CREATION_DATE"];
    field.archive_date     = header["ARCHIVE_DATE"];
    field.floating_point   = header["FLOATING_POINT"];

    return field.data_start;
  }
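  // For orientation, the header parsed above is a plain-text block of
  // KEY = VALUE pairs delimited by BEGIN_HEADER/END_HEADER. A sketch with
  // purely illustrative values (only the keys are taken from the code above;
  // the CREATOR, CREATOR_HARDWARE, CREATION_DATE and ARCHIVE_DATE provenance
  // keys are elided):
  //
  //   BEGIN_HEADER
  //   HDR_VERSION = 1.0
  //   DATATYPE = 4D_SU3_GAUGE_3x3
  //   STORAGE_FORMAT = 1.0
  //   DIMENSION_1 = 16
  //   DIMENSION_2 = 16
  //   DIMENSION_3 = 16
  //   DIMENSION_4 = 32
  //   LINK_TRACE = 0.001
  //   PLAQUETTE = 0.58
  //   BOUNDARY_1 = PERIODIC
  //   BOUNDARY_2 = PERIODIC
  //   BOUNDARY_3 = PERIODIC
  //   BOUNDARY_4 = PERIODIC
  //   CHECKSUM = 2b9a4f1c
  //   ENSEMBLE_ID = UKQCD
  //   ENSEMBLE_LABEL = DWF
  //   SEQUENCE_NUMBER = 1
  //   FLOATING_POINT = IEEE64BIG
  //   END_HEADER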

  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Now the meat: the object readers
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  template<class vsimd>
  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
                                       FieldMetaData& header,
                                       std::string file)
  {
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

    GridBase *grid = Umu._grid;
    int offset = readHeader(file,Umu._grid,header);

    FieldMetaData clone(header);

    std::string format(header.floating_point);

    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64"));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
          (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
          (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
      }
    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
      }
    } else {
      assert(0);
    }
    GaugeStatistics<GaugeField>(Umu,clone);

    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
             <<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
             <<" header "<<header.plaquette<<std::endl;
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
             <<" header "<<header.link_trace<<std::endl;

    if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
      std::cout << " Plaquette mismatch "<<std::endl;
      std::cout << Umu[0]<<std::endl;
      std::cout << Umu[1]<<std::endl;
    }
    if ( nersc_csum != header.checksum ) {
      std::cerr << " checksum mismatch " << std::endl;
      std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
      std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
      std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );

    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }
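  // A minimal usage sketch for the reader (editorial addition; the grid
  // construction, the LatticeGaugeField typedef and the file name are
  // illustrative assumptions, not part of this header):
  //
  //   GridCartesian     grid(latt_size, simd_layout, mpi_layout);
  //   LatticeGaugeField Umu(&grid);
  //   FieldMetaData     header;
  //   NerscIO::readConfiguration(Umu, header, "ckpoint_lat.1000");
  //   // header now carries the plaquette, link trace and checksum, all of
  //   // which have been checked against the freshly read field.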

  template<class vsimd>
  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
                                        std::string file,
                                        int two_row,
                                        int bits32)
  {
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
    ///////////////////////////////////////////
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = "UKQCD";
    header.ensemble_label  = "DWF";
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;

    GridBase *grid = Umu._grid;
    GridMetaData(grid,header);
    assert(header.nd==4);
    GaugeStatistics<GaugeField>(Umu,header);
    MachineCharacteristics(header);
    int offset;

    truncate(file);

    // Sod it -- always write 3x3 double
    header.floating_point = std::string("IEEE64BIG");
    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
    GaugeSimpleUnmunger<fobj3D,sobj> munge;
    offset = writeHeader(header,file);

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
                                              nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
    writeHeader(header,file);

    std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
             <<std::hex<<header.checksum
             <<std::dec<<" plaq "<< header.plaquette <<std::endl;
  }
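  // A matching usage sketch for the writer (editorial addition; the file name
  // is illustrative). Note that, per the "Sod it" comment above, the two_row
  // and bits32 arguments are currently not consulted and a 3x3 IEEE64BIG
  // layout is always written:
  //
  //   NerscIO::writeConfiguration(Umu, "ckpoint_lat.1000", 0, 0);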

  ///////////////////////////////
  // RNG state
  ///////////////////////////////
  static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
  {
    typedef typename GridParallelRNG::RngStateType RngStateType;
    // Following should become arguments
    FieldMetaData header;
    header.sequence_number = 1;
    header.ensemble_id     = "UKQCD";
    header.ensemble_label  = "DWF";

    GridBase *grid = parallel._grid;

    GridMetaData(grid,header);
    assert(header.nd==4);
    header.link_trace = 0.0;
    header.plaquette  = 0.0;
    MachineCharacteristics(header);

    int offset;

#ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
#endif
#ifdef RNG_MT19937
    header.floating_point = std::string("UINT32");
    header.data_type      = std::string("MT19937");
#endif
#ifdef RNG_SITMO
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("SITMO");
#endif

    truncate(file);
    offset = writeHeader(header,file);

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
    offset = writeHeader(header,file);

    std::cout<<GridLogMessage
             <<"Written NERSC RNG STATE "<<file<< " checksum "
             <<std::hex<<header.checksum
             <<std::dec<<std::endl;
  }

  static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,FieldMetaData& header,std::string file)
  {
    typedef typename GridParallelRNG::RngStateType RngStateType;
    GridBase *grid = parallel._grid;

    int offset = readHeader(file,grid,header);

    FieldMetaData clone(header);

    std::string format(header.floating_point);
    std::string data_type(header.data_type);

#ifdef RNG_RANLUX
    assert(format    == std::string("UINT64"));
    assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
    assert(format    == std::string("UINT32"));
    assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
    assert(format    == std::string("UINT64"));
    assert(data_type == std::string("SITMO"));
#endif

    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);

    if ( nersc_csum != header.checksum ) {
      std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
      exit(0);
    }
    assert(nersc_csum == header.checksum );

    std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
  }
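  // A round-trip sketch for the RNG state (editorial addition; the RNG
  // construction and the file name are illustrative assumptions):
  //
  //   GridSerialRNG   sRNG;
  //   GridParallelRNG pRNG(&grid);
  //   FieldMetaData   rng_header;
  //   NerscIO::writeRNGState(sRNG, pRNG, "ckpoint_rng.1000");
  //   NerscIO::readRNGState (sRNG, pRNG, rng_header, "ckpoint_rng.1000");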
};
}}
#endif