Grid/tests/IO/Test_nersc_io.cc

    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./tests/Test_nersc_io.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;


// Exercises the NERSC I/O routines and prints diagnostics to the Grid log:
//   1. writes the serial + parallel RNG state to disk, reads it back into a
//      second pair of generators and prints the difference of the next draws;
//   2. writes a hot-start gauge configuration, reads it back and prints the
//      norm of the difference;
//   3. prints the plaquette (per slice, total, and block-summed onto a coarse
//      grid) and the link trace of the configuration that was read back;
//   4. writes two more copies of the configuration, passing 0 and 1 as the
//      third writeConfiguration argument (the one named `tworow` below).
// Nothing is asserted; results are only logged for inspection.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::cout <<GridLogMessage<< " main "<<std::endl;

  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> latt_size  ({48,48,48,96});
  //std::vector<int> latt_size  ({32,32,32,32});
  //std::vector<int> latt_size  ({16,16,16,32});
  std::vector<int> clatt_size  ({4,4,4,8});

  // Direction along which the plaquette is sliced, and its extent.
  // Both the slice buffer size and the sliceSum call below use these, so
  // changing orthodir keeps them in step.
  const int orthodir = 3;
  const int orthosz  = latt_size[orthodir];

  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
  GridCartesian     Coarse(clatt_size,simd_layout,mpi_layout);

  // "a" generators are seeded here; "b" generators are left unseeded and
  // receive their state from the file written by the "a" pair.
  GridParallelRNG   pRNGa(&Fine);
  GridParallelRNG   pRNGb(&Fine);
  GridSerialRNG     sRNGa;
  GridSerialRNG     sRNGb;

  std::cout <<GridLogMessage<< " seeding... "<<std::endl;
  pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  std::cout <<GridLogMessage<< " ...done "<<std::endl;

  // ---- RNG state round trip -------------------------------------------
  std::string rfile("./ckpoint_rng.4000");
  NerscIO::writeRNGState(sRNGa,pRNGa,rfile);
  NerscField rngheader;
  NerscIO::readRNGState (sRNGb,pRNGb,rngheader,rfile);

  // Draw once from the original and once from the restored parallel RNG
  // and log the norm of the difference alongside the norm of one draw.
  LatticeComplex tmpa(&Fine); random(pRNGa,tmpa);
  LatticeComplex tmpb(&Fine); random(pRNGb,tmpb);
  tmpa = tmpa - tmpb;
  std::cout <<GridLogMessage<< " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;

  // Same comparison for the serial generators; both values are printed.
  ComplexD a,b;

  random(sRNGa,a);
  random(sRNGb,b);
  std::cout <<GridLogMessage<< " serial RNG numbers "<<a<<" "<<b<<std::endl;

  // ---- Gauge configuration round trip ---------------------------------
  LatticeGaugeField Umu(&Fine);
  LatticeGaugeField Umu_diff(&Fine);
  LatticeGaugeField Umu_saved(&Fine);

  // One colour-matrix field per direction; sized by Nd because every loop
  // below indexes it with mu < Nd.
  std::vector<LatticeColourMatrix> U(Nd,&Fine);

  SU3::HotConfiguration(pRNGa,Umu);

  NerscField header;
  std::string file("./ckpoint_lat.4000");

  int precision32 = 0;
  int tworow      = 0;
  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
  Umu_saved = Umu;                       // keep the in-memory copy for comparison
  NerscIO::readConfiguration(Umu,header,file);
  Umu_diff = Umu - Umu_saved;
  //std::cout << "Umu_save "<<Umu_saved[0]<<std::endl;
  //std::cout << "Umu_read "<<Umu[0]<<std::endl;
  std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;

  // Split the (re-read) gauge field into its Nd link fields.
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }

  // ---- Link trace -------------------------------------------------------
  // Painful ; fix syntactical niceness
  LatticeComplex LinkTrace(&Fine);
  LinkTrace=zero;
  for(int mu=0;mu<Nd;mu++){
    LinkTrace = LinkTrace + trace(U[mu]);
  }

  // ---- Plaquette --------------------------------------------------------
  // (1+2+3)=6 = N(N-1)/2 terms
  LatticeComplex Plaq(&Fine);
  LatticeComplex cPlaq(&Coarse);

  Plaq = zero;
#if 1
  // Accumulate the trace of the 1x1 loop over every pair nu < mu.
  for(int mu=1;mu<Nd;mu++){
    for(int nu=0;nu<mu;nu++){
      Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
    }
  }
#endif
  // Normalisation: global site count, the 6 (mu,nu) pairs counted above,
  // and a further factor 3 (presumably the number of colours -- TODO confirm).
  double vol = Fine.gSites();
  Complex PlaqScale(1.0/vol/6.0/3.0);

  // Slice Plaq along orthodir; Plaq_T is pre-sized to that direction's extent.
  std::vector<TComplex> Plaq_T(orthosz);
  sliceSum(Plaq,Plaq_T,orthodir);
  const int Nt = static_cast<int>(Plaq_T.size());

  TComplex Plaq_T_sum; 
  Plaq_T_sum=zero;
  for(int t=0;t<Nt;t++){
    Plaq_T_sum = Plaq_T_sum+Plaq_T[t];
    Complex Pt=TensorRemove(Plaq_T[t]);
    // PlaqScale divides by the full volume, so multiply by Nt to
    // normalise a single slice.
    std::cout<<GridLogMessage << "sliced ["<<t<<"]" <<Pt*PlaqScale*Real(Nt)<<std::endl;
  }

  {
    Complex Pt = TensorRemove(Plaq_T_sum);
    std::cout<<GridLogMessage << "total " <<Pt*PlaqScale<<std::endl;
  }  


  // Same quantity from a direct global sum of Plaq.
  TComplex Tp = sum(Plaq);
  Complex p  = TensorRemove(Tp);
  std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;

  // Link trace normalised by site count, 4 directions and the same factor 3.
  Complex LinkTraceScale(1.0/vol/4.0/3.0);
  TComplex Tl = sum(LinkTrace);
  Complex l  = TensorRemove(Tl);
  std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;

  // Block-sum Plaq onto the coarse grid and log its global sum with the
  // same normalisation as the fine-grid total.
  blockSum(cPlaq,Plaq);
  TComplex TcP = sum(cPlaq);
  Complex ll= TensorRemove(TcP);
  std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;

  // ---- Write clones with both values of the two-row flag ----------------
  std::string clone2x3("./ckpoint_clone2x3.4000");
  std::string clone3x3("./ckpoint_clone3x3.4000");

  NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);
  NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);
  
  Grid_finalize();
  return 0;
}
Global edit adding copyright and license info to every source file. 2016-01-02 14:51:32 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./tests/Test_nersc_io.cc`

			`Copyright (C) 2015`

			`Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>`
			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`
			`Author: paboyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
Open up dependency on Eigen and FFTW 2016-07-07 22:31:07 +01:00			`#include <Grid/Grid.h>`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
			`using namespace std;`
			`using namespace Grid;`
			`using namespace Grid::QCD;`


			`int main (int argc, char ** argv)`
			`{`
			`Grid_init(&argc,&argv);`

Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< " main "<<std::endl;`
Command line args and a general clean up 2015-05-11 12:43:10 +01:00
big commit fixing nocompiles in defective C++11 compilers (gcc, icpc). stared getting to near the bleeding edge I guess 2015-06-30 15:01:26 +01:00			`std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());`
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00			`std::vector<int> mpi_layout = GridDefaultMpi();`
Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::vector<int> latt_size ({48,48,48,96});`
			`//std::vector<int> latt_size ({32,32,32,32});`
			`//std::vector<int> latt_size ({16,16,16,32});`
Reorganised the TODO. Really getting somewhere 2015-04-23 20:42:30 +01:00			`std::vector<int> clatt_size ({4,4,4,8});`
Slice summation working. May move this into lattice/Grid_lattice_reduction however 2015-04-23 15:13:00 +01:00			`int orthodir=3;`
			`int orthosz =latt_size[orthodir];`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
			`GridCartesian Fine(latt_size,simd_layout,mpi_layout);`
Reorganised the TODO. Really getting somewhere 2015-04-23 20:42:30 +01:00			`GridCartesian Coarse(clatt_size,simd_layout,mpi_layout);`

Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00
Options to use mersenne twister OR ranlux48 via --enable-rng flag at configure time. Can save and restore RNG state via new (serial) I/O routines in a NERSC header style file. Store a Parallel (one per site) and a single serial RNG file. 2015-12-19 18:32:25 +00:00			`GridParallelRNG pRNGa(&Fine);`
			`GridParallelRNG pRNGb(&Fine);`
			`GridSerialRNG sRNGa;`
			`GridSerialRNG sRNGb;`

Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< " seeding... "<<std::endl;`
SITMO I/O for NERSC working now bit repro 2017-05-05 16:54:44 +01:00			`pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));`
			`sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));`
Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< " ...done "<<std::endl;`

Options to use mersenne twister OR ranlux48 via --enable-rng flag at configure time. Can save and restore RNG state via new (serial) I/O routines in a NERSC header style file. Store a Parallel (one per site) and a single serial RNG file. 2015-12-19 18:32:25 +00:00			`std::string rfile("./ckpoint_rng.4000");`
			`NerscIO::writeRNGState(sRNGa,pRNGa,rfile);`
			`NerscField rngheader;`
			`NerscIO::readRNGState (sRNGb,pRNGb,rngheader,rfile);`

			`LatticeComplex tmpa(&Fine); random(pRNGa,tmpa);`
			`LatticeComplex tmpb(&Fine); random(pRNGb,tmpb);`
			`tmpa = tmpa - tmpb;`
Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;`
Options to use mersenne twister OR ranlux48 via --enable-rng flag at configure time. Can save and restore RNG state via new (serial) I/O routines in a NERSC header style file. Store a Parallel (one per site) and a single serial RNG file. 2015-12-19 18:32:25 +00:00
			`ComplexD a,b;`

			`random(sRNGa,a);`
			`random(sRNGb,b);`
Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< " serial RNG numbers "<<a<<" "<<b<<std::endl;`
Few modifications on stdout messages 2016-10-20 17:01:59 +01:00
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`LatticeGaugeField Umu(&Fine);`
add bug-fixed Test_nersc)_io. 2015-06-16 20:23:27 +01:00			`LatticeGaugeField Umu_diff(&Fine);`
			`LatticeGaugeField Umu_saved(&Fine);`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
			`std::vector<LatticeColourMatrix> U(4,&Fine);`

Attempts to speed up the parallel IO 2017-05-25 13:32:24 +01:00			`SU3::HotConfiguration(pRNGa,Umu);`
SITMO I/O for NERSC working now bit repro 2017-05-05 16:54:44 +01:00
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`NerscField header;`
			`std::string file("./ckpoint_lat.4000");`
SITMO I/O for NERSC working now bit repro 2017-05-05 16:54:44 +01:00
			`int precision32 = 0;`
			`int tworow = 0;`
			`NerscIO::writeConfiguration(Umu,file,tworow,precision32);`
Diff comparison check 2017-05-30 23:40:11 +01:00			`Umu_saved = Umu;`
Binary IO file for generic Grid array parallel I/O. Number of IO MPI tasks can be varied by selecting which dimensions use parallel IO and which dimensions use Serial send to boss I/O. Thus can neck down from, say 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes doing the I/O. Interpolates nicely between ALL nodes write their data, a single boss per time-plane in processor space [old UKQCD fortran code did this], and a single node doing all I/O. Not sure I have the transfer sizes big enough and am not overly convinced fstream is guaranteed to not give buffer inconsistencies unless I set streambuf size to zero. Practically it has worked on 8 tasks, 2x1x2x2 writing /cloning NERSC configurations on my MacOS + OpenMPI and Clang environment. It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from each node in order to gather bigger chunks at the syscall level. That would push us up to the circa 8x 1848 == 4KB size write chunk, and by taking, say, x/y non parallel we get to 16MB contiguous chunks written in multi 4KB transactions per IOnode in 64^3 lattices for configuration I/O. I suspect this is fine for system performance. 2015-08-26 13:40:29 +01:00			`NerscIO::readConfiguration(Umu,header,file);`
Diff comparison check 2017-05-30 23:40:11 +01:00			`Umu_diff = Umu - Umu_saved;`
			`//std::cout << "Umu_save "<<Umu_saved[0]<<std::endl;`
			`//std::cout << "Umu_read "<<Umu[0]<<std::endl;`
Roll over to MPI version of I/O 2017-06-01 22:38:18 +01:00			`std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
			`for(int mu=0;mu<Nd;mu++){`
big commit fixing nocompiles in defective C++11 compilers (gcc, icpc). stared getting to near the bleeding edge I guess 2015-06-30 15:01:26 +01:00			`U[mu] = PeekIndex<LorentzIndex>(Umu,mu);`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`}`

			`// Painful ; fix syntactical niceness`
			`LatticeComplex LinkTrace(&Fine);`
			`LinkTrace=zero;`
			`for(int mu=0;mu<Nd;mu++){`
			`LinkTrace = LinkTrace + trace(U[mu]);`
			`}`

			`// (1+2+3)=6 = N(N-1)/2 terms`
			`LatticeComplex Plaq(&Fine);`
Reorganised the TODO. Really getting somewhere 2015-04-23 20:42:30 +01:00			`LatticeComplex cPlaq(&Coarse);`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`Plaq = zero;`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00			`#if 1`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`for(int mu=1;mu<Nd;mu++){`
			`for(int nu=0;nu<mu;nu++){`
			`Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));`
			`}`
			`}`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00			`#endif`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`double vol = Fine.gSites();`
			`Complex PlaqScale(1.0/vol/6.0/3.0);`
Slice summation working. May move this into lattice/Grid_lattice_reduction however 2015-04-23 15:13:00 +01:00
			`std::vector<TComplex> Plaq_T(orthosz);`
			`sliceSum(Plaq,Plaq_T,Nd-1);`
			`int Nt = Plaq_T.size();`

Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`TComplex Plaq_T_sum;`
			`Plaq_T_sum=zero;`
Slice summation working. May move this into lattice/Grid_lattice_reduction however 2015-04-23 15:13:00 +01:00			`for(int t=0;t<Nt;t++){`
			`Plaq_T_sum = Plaq_T_sum+Plaq_T[t];`
			`Complex Pt=TensorRemove(Plaq_T[t]);`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "sliced ["<<t<<"]" <<Pt*PlaqScale*Real(Nt)<<std::endl;`
Slice summation working. May move this into lattice/Grid_lattice_reduction however 2015-04-23 15:13:00 +01:00			`}`

			`{`
			`Complex Pt = TensorRemove(Plaq_T_sum);`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "total " <<Pt*PlaqScale<<std::endl;`
Slice summation working. May move this into lattice/Grid_lattice_reduction however 2015-04-23 15:13:00 +01:00			`}`

add bug-fixed Test_nersc)_io. 2015-06-16 20:23:27 +01:00
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`TComplex Tp = sum(Plaq);`
			`Complex p = TensorRemove(Tp);`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
			`Complex LinkTraceScale(1.0/vol/4.0/3.0);`
			`TComplex Tl = sum(LinkTrace);`
			`Complex l = TensorRemove(Tl);`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;`
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00
Conjugate residual algorithm; some more unary functions 2015-06-08 12:04:59 +01:00			`blockSum(cPlaq,Plaq);`
Reorganised the TODO. Really getting somewhere 2015-04-23 20:42:30 +01:00			`TComplex TcP = sum(cPlaq);`
			`Complex ll= TensorRemove(TcP);`
Sizable improvement in multigrid for unsquared. 6000 matmuls CG unprec 2000 matmuls CG prec (4000 eo muls) 1050 matmuls PGCR on 16^3 x 32 x 8 m=.01 Substantial effort on timing and logging infrastructure 2015-07-23 17:31:13 +01:00			`std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;`
Binary IO file for generic Grid array parallel I/O. Number of IO MPI tasks can be varied by selecting which dimensions use parallel IO and which dimensions use Serial send to boss I/O. Thus can neck down from, say 1024 nodes = 4x4x8x8 to {1,8,32,64,128,256,1024} nodes doing the I/O. Interpolates nicely between ALL nodes write their data, a single boss per time-plane in processor space [old UKQCD fortran code did this], and a single node doing all I/O. Not sure I have the transfer sizes big enough and am not overly convinced fstream is guaranteed to not give buffer inconsistencies unless I set streambuf size to zero. Practically it has worked on 8 tasks, 2x1x2x2 writing /cloning NERSC configurations on my MacOS + OpenMPI and Clang environment. It is VERY easy to switch to pwrite at a later date, and also easy to send x-strips around from each node in order to gather bigger chunks at the syscall level. That would push us up to the circa 8x 1848 == 4KB size write chunk, and by taking, say, x/y non parallel we get to 16MB contiguous chunks written in multi 4KB transactions per IOnode in 64^3 lattices for configuration I/O. I suspect this is fine for system performance. 2015-08-26 13:40:29 +01:00
			`std::string clone2x3("./ckpoint_clone2x3.4000");`
			`std::string clone3x3("./ckpoint_clone3x3.4000");`

			`NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);`
			`NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);`
Reorganised the TODO. Really getting somewhere 2015-04-23 20:42:30 +01:00
Got the NERSC IO working and fixed a bug in cshift. 2015-04-22 22:46:48 +01:00			`Grid_finalize();`
			`}`