From 831ca4e3bf8e0b4231f395b2af9308e007b73186 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 14 Mar 2017 14:55:18 +0900 Subject: [PATCH 001/170] Added Scalar action for fields in the adjoint representation --- lib/qcd/action/Actions.h | 5 + lib/qcd/action/scalar/ScalarAction.h | 61 ++++++----- lib/qcd/action/scalar/ScalarImpl.h | 93 ++++++++-------- .../action/scalar/ScalarInteractionAction.h | 84 +++++++-------- lib/qcd/hmc/GenericHMCrunner.h | 3 + lib/qcd/representations/hmc_types.h | 2 +- tests/hmc/Test_hmc_ScalarActionNxN.cc | 100 ++++++++++++++++++ 7 files changed, 227 insertions(+), 121 deletions(-) create mode 100644 tests/hmc/Test_hmc_ScalarActionNxN.cc diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index daf64f3d..0214b8f4 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -69,6 +69,7 @@ Author: paboyle //////////////////////////////////////////// #include #include +#include namespace Grid { namespace QCD { @@ -106,6 +107,10 @@ typedef ScalarAction ScalarActionR; typedef ScalarAction ScalarActionF; typedef ScalarAction ScalarActionD; +typedef ScalarInteractionAction ScalarAdjActionR; +typedef ScalarInteractionAction ScalarAdjActionF; +typedef ScalarInteractionAction ScalarAdjActionD; + }} //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/lib/qcd/action/scalar/ScalarAction.h b/lib/qcd/action/scalar/ScalarAction.h index f10ec9a6..2c82d2e3 100644 --- a/lib/qcd/action/scalar/ScalarAction.h +++ b/lib/qcd/action/scalar/ScalarAction.h @@ -6,10 +6,10 @@ Copyright (C) 2015 -Author: Azusa Yamaguchi -Author: Peter Boyle -Author: neo -Author: paboyle + Author: Azusa Yamaguchi + Author: Peter Boyle + Author: neo + Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,50 +35,49 @@ directory namespace Grid { // FIXME drop the QCD namespace everywhere here - - template - class ScalarAction : public QCD::Action { - public: + +template +class ScalarAction : public QCD::Action { + public: INHERIT_FIELD_TYPES(Impl); - - private: + + private: RealD mass_square; RealD lambda; - - public: - ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){}; - virtual std::string LogParameters(){ + public: + ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {} + + virtual std::string LogParameters() { std::stringstream sstream; sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl; sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl; return sstream.str(); - } - - virtual std::string action_name(){return "ScalarAction";} - - virtual void refresh(const Field &U, - GridParallelRNG &pRNG){}; // noop as no pseudoferms - + virtual std::string action_name() {return "ScalarAction";} + + virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms + virtual RealD S(const Field &p) { return (mass_square * 0.5 + QCD::Nd) * ScalarObs::sumphisquared(p) + - (lambda / 24.) * ScalarObs::sumphifourth(p) + - ScalarObs::sumphider(p); + (lambda / 24.) * ScalarObs::sumphifourth(p) + + ScalarObs::sumphider(p); }; - + virtual void deriv(const Field &p, - Field &force) { + Field &force) { Field tmp(p._grid); Field p2(p._grid); ScalarObs::phisquared(p2, p); tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1)); for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1); - - force=+(mass_square + 2. 
* QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp; - }; - }; - -} // Grid + + force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp; + } +}; + + + +} // namespace Grid #endif // SCALAR_ACTION_H diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index ee2d2fb8..6d14b61a 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -5,96 +5,99 @@ namespace Grid { //namespace QCD { - template - class ScalarImplTypes { - public: +template +class ScalarImplTypes { + public: typedef S Simd; - + template using iImplField = iScalar > >; - + typedef iImplField SiteField; - - + typedef Lattice Field; - - static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + + static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) { gaussian(pRNG, P); } - + static inline Field projectForce(Field& P){return P;} - - static inline void update_field(Field& P, Field& U, double ep){ + + static inline void update_field(Field& P, Field& U, double ep) { U += P*ep; } - - static inline RealD FieldSquareNorm(Field& U){ + + static inline RealD FieldSquareNorm(Field& U) { return (- sum(trace(U*U))/2.0); } - + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { gaussian(pRNG, U); } - + static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { gaussian(pRNG, U); } - + static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { U = 1.0; } - + }; template - class ScalarMatrixImplTypes { + class ScalarAdjMatrixImplTypes { public: typedef S Simd; - template using iImplField = iScalar > >; - + typedef iImplField SiteField; - - + typedef Lattice Field; - - static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ - gaussian(pRNG, P); + + static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) { + QCD::SU::GaussianFundamentalLieAlgebraMatrix(pRNG, P); } - - static inline Field projectForce(Field& P){return P;} - - static inline void update_field(Field& P, Field& U, double ep){ + + static inline Field projectForce(Field& P) {return P;} + + static inline void update_field(Field& P, Field& U, double ep) { U += P*ep; } - - static inline RealD FieldSquareNorm(Field& U){ - return (TensorRemove(- sum(trace(U*U))*0.5).real()); + + static inline RealD FieldSquareNorm(Field& U) { + return (TensorRemove(sum(trace(U*U))).real()); } - + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { - gaussian(pRNG, U); + QCD::SU::LieRandomize(pRNG, U); } - + static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { - gaussian(pRNG, U); + QCD::SU::LieRandomize(pRNG, U, 0.01); } - + static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { - U = 1.0; + U = zero; } - + }; - - + + typedef ScalarImplTypes ScalarImplR; typedef ScalarImplTypes ScalarImplF; typedef ScalarImplTypes ScalarImplD; - - //} -} + + // Hardcoding here the size of the matrices + typedef ScalarAdjMatrixImplTypes ScalarAdjImplR; + typedef ScalarAdjMatrixImplTypes ScalarAdjImplF; + typedef ScalarAdjMatrixImplTypes ScalarAdjImplD; + + + //} +} #endif diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index bd54a010..2607b041 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -6,10 +6,7 @@ Copyright (C) 2015 -Author: Azusa Yamaguchi -Author: Peter Boyle -Author: neo -Author: paboyle + Author: Guido Cossu This program is free software; you can 
redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,55 +27,54 @@ directory *************************************************************************************/ /* END LEGAL */ -#ifndef SCALAR_ACTION_H -#define SCALAR_ACTION_H +#ifndef SCALAR_INT_ACTION_H +#define SCALAR_INT_ACTION_H namespace Grid { // FIXME drop the QCD namespace everywhere here - - template - class ScalarInteractionAction : public QCD::Action { - public: - INHERIT_FIELD_TYPES(Impl); - - private: + +template +class ScalarInteractionAction : public QCD::Action { RealD mass_square; RealD lambda; - - public: - ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){}; - virtual std::string LogParameters(){ + public: + INHERIT_FIELD_TYPES(Impl); + ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {} + + virtual std::string LogParameters() { std::stringstream sstream; sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl; sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl; return sstream.str(); - } - - virtual std::string action_name(){return "ScalarAction";} - - virtual void refresh(const Field &U, - GridParallelRNG &pRNG){}; // noop as no pseudoferms - - virtual RealD S(const Field &p) { - return (mass_square * 0.5 + QCD::Nd) * ScalarObs::sumphisquared(p) + - (lambda / 24.) * ScalarObs::sumphifourth(p) + - ScalarObs::sumphider(p); - }; - - virtual void deriv(const Field &p, - Field &force) { - Field tmp(p._grid); - Field p2(p._grid); - ScalarObs::phisquared(p2, p); - tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1)); - for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1); - - force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) 
* p2 * p + tmp; - }; - }; - -} // Grid -#endif // SCALAR_ACTION_H + virtual std::string action_name() {return "ScalarAction";} + + virtual void refresh(const Field &U, + GridParallelRNG &pRNG) {} // noop as no pseudoferms + + virtual RealD S(const Field &p) { + Field action(p._grid); + Field pshift(p._grid); + Field phisquared(p._grid); + phisquared = p*p; + action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared; + for (int mu = 0; mu < QCD::Nd; mu++) { + pshift = Cshift(p, mu, +1); // not efficient implement with stencils + action -= pshift*p + p*pshift; + } + return -(TensorRemove(sum(trace(action)))).real(); + }; + + virtual void deriv(const Field &p, + Field &force) { + force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p; + // following is inefficient + for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); + } +}; + +} // namespace Grid + +#endif // SCALAR_INT_ACTION_H diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/lib/qcd/hmc/GenericHMCrunner.h index 66b16435..a97fb4e4 100644 --- a/lib/qcd/hmc/GenericHMCrunner.h +++ b/lib/qcd/hmc/GenericHMCrunner.h @@ -202,6 +202,9 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate ScalarGenericHMCRunner; +typedef HMCWrapperTemplate + ScalarAdjGenericHMCRunner; + } // namespace QCD } // namespace Grid diff --git a/lib/qcd/representations/hmc_types.h b/lib/qcd/representations/hmc_types.h index 3701c9b2..b4991941 100644 --- a/lib/qcd/representations/hmc_types.h +++ b/lib/qcd/representations/hmc_types.h @@ -62,7 +62,7 @@ class Representations { typedef Representations NoHirep; typedef Representations > ScalarFields; - //typedef Representations > ScalarMatrixFields; +typedef Representations > ScalarMatrixFields; // Helper classes to access the elements // Strips the first N parameters from the tuple diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc new file mode 100644 index 00000000..8b93efde --- /dev/null +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -0,0 +1,100 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_WilsonFermionGauge.cc + +Copyright (C) 2016 + +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include +namespace Grid{ +class ScalarActionParameters : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters, + double, mass_squared, + double, lambda); +}; + +} +int main(int argc, char **argv) { + using namespace Grid; + using namespace Grid::QCD; + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef ScalarAdjGenericHMCRunner HMCWrapper; // Uses the default minimum norm, real scalar fields + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + HMCWrapper TheHMC; + + // Grid from the command line + GridModule ScalarGrid; + ScalarGrid.set_full( SpaceTimeGrid::makeFourDimGrid( + GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), + GridDefaultMpi())); + ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full())); + TheHMC.Resources.AddGrid("scalar", ScalarGrid); + // Possibile to create the module by hand + // hardcoding parameters or using a Reader + + // Checkpointer definition + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_scalar_lat"; + CPparams.rng_prefix = "ckpoint_scalar_rng"; + CPparams.saveInterval = 50; + CPparams.format = "IEEE64BIG"; + + TheHMC.Resources.LoadBinaryCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + ///////////////////////////////////////////////////////////// + // Collect actions, here use more encapsulation + + // Scalar action in adjoint representation + ScalarActionParameters SPar; + SPar.mass_squared = 0.5; + SPar.lambda = 0.1; + ScalarAdjActionR Saction(SPar.mass_squared, SPar.lambda); + + // Collect actions + ActionLevel Level1(1); + Level1.push_back(&Saction); + TheHMC.TheAction.push_back(Level1); + ///////////////////////////////////////////////////////////// + + // HMC parameters are serialisable + TheHMC.Parameters.MD.MDsteps = 10; + TheHMC.Parameters.MD.trajL = 1.0; + + TheHMC.ReadCommandLine(argc, argv); + TheHMC.Run(); + + Grid_finalize(); + +} // main From 38806343a873ea10264c79103db31182d6770947 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 15 Mar 2017 15:16:16 +0900 Subject: [PATCH 002/170] Improving efficiency of the force term --- .../action/scalar/ScalarInteractionAction.h | 91 ++++++++++++++++--- tests/Test_stencil.cc | 43 +++++---- tests/hmc/Test_hmc_ScalarActionNxN.cc | 11 +-- 3 files changed, 104 insertions(+), 41 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 2607b041..5a322a5e 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -30,17 +30,34 @@ directory #ifndef SCALAR_INT_ACTION_H #define SCALAR_INT_ACTION_H + +// Note: this action can completely absorb the ScalarAction for real float fields +// use the scalarObjs to generalise the structure + namespace Grid { // FIXME drop the QCD namespace everywhere here template class ScalarInteractionAction : public QCD::Action { +public: + INHERIT_FIELD_TYPES(Impl); +private: RealD mass_square; RealD 
lambda; + + typedef typename Field::vector_object vobj; + typedef CartesianStencil Stencil; + + SimpleCompressor compressor; + int npoint = 8; + std::vector directions = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions + std::vector displacements = {1,1,1,1, -1,-1,-1,-1}; + + public: - INHERIT_FIELD_TYPES(Impl); - ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {} + + ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l){} virtual std::string LogParameters() { std::stringstream sstream; @@ -51,27 +68,75 @@ class ScalarInteractionAction : public QCD::Action { virtual std::string action_name() {return "ScalarAction";} - virtual void refresh(const Field &U, - GridParallelRNG &pRNG) {} // noop as no pseudoferms + virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} virtual RealD S(const Field &p) { - Field action(p._grid); - Field pshift(p._grid); - Field phisquared(p._grid); + static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); + phiStencil.HaloExchange(p, compressor); + + Field action(p._grid), pshift(p._grid), phisquared(p._grid); phisquared = p*p; action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared; for (int mu = 0; mu < QCD::Nd; mu++) { - pshift = Cshift(p, mu, +1); // not efficient implement with stencils - action -= pshift*p + p*pshift; + // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils + PARALLEL_FOR_LOOP + for (int i = 0; i < p._grid->oSites(); i++) { + int permute_type; + StencilEntry *SE; + vobj temp2; + vobj *temp; + vobj *t_p; + + SE = phiStencil.GetEntry(permute_type, mu, i); + t_p = &p._odata[i]; + if ( SE->_is_local ) { + temp = &p._odata[SE->_offset]; + if ( SE->_permute ) { + permute(temp2, *temp, permute_type); + action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2; + } else { + action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp); + } + } else { + action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; + } + } + // action -= pshift*p + p*pshift; } + // NB the trace in the algebra is normalised to 1/2 + // minus sign coming from the antihermitian fields return -(TensorRemove(sum(trace(action)))).real(); }; - virtual void deriv(const Field &p, - Field &force) { + virtual void deriv(const Field &p, Field &force) { force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p; - // following is inefficient - for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); + // move this outside + static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); + phiStencil.HaloExchange(p, compressor); + + //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); + for (int point = 0; point < npoint; point++) { + PARALLEL_FOR_LOOP + for (int i = 0; i < p._grid->oSites(); i++) { + vobj *temp; + vobj temp2; + int permute_type; + StencilEntry *SE; + SE = phiStencil.GetEntry(permute_type, point, i); + + if ( SE->_is_local ) { + temp = &p._odata[SE->_offset]; + if ( SE->_permute ) { + permute(temp2, *temp, permute_type); + force._odata[i] -= temp2; + } else { + force._odata[i] -= *temp; + } + } else { + force._odata[i] -= phiStencil.CommBuf()[SE->_offset]; + } + } + } } }; diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 1b71b8a5..1d35e1bb 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid 
physics library, www.github.com/paboyle/Grid Source file: ./tests/Test_stencil.cc @@ -33,9 +33,8 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; -int main (int argc, char ** argv) -{ - Grid_init(&argc,&argv); +int main(int argc, char ** argv) { + Grid_init(&argc, &argv); // typedef LatticeColourMatrix Field; typedef LatticeComplex Field; @@ -47,7 +46,7 @@ int main (int argc, char ** argv) std::vector mpi_layout = GridDefaultMpi(); double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; - + GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); GridParallelRNG fRNG(&Fine); @@ -55,14 +54,14 @@ int main (int argc, char ** argv) // fRNG.SeedRandomDevice(); std::vector seeds({1,2,3,4}); fRNG.SeedFixedIntegers(seeds); - + Field Foo(&Fine); Field Bar(&Fine); Field Check(&Fine); Field Diff(&Fine); LatticeComplex lex(&Fine); - lex = zero; + lex = zero; random(fRNG,Foo); gaussian(fRNG,Bar); @@ -98,7 +97,7 @@ int main (int argc, char ** argv) Fine.oCoorFromOindex(ocoor,o); ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir]; } - + SimpleCompressor compress; myStencil.HaloExchange(Foo,compress); @@ -106,16 +105,16 @@ int main (int argc, char ** argv) // Implement a stencil code that should agree with cshift! for(int i=0;ioSites();i++){ - + int permute_type; StencilEntry *SE; SE = myStencil.GetEntry(permute_type,0,i); - + if ( SE->_is_local && SE->_permute ) permute(Check._odata[i],Foo._odata[SE->_offset],permute_type); else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; - else + else Check._odata[i] = myStencil.CommBuf()[SE->_offset]; } @@ -144,7 +143,7 @@ int main (int argc, char ** argv) <<") " < compress; EStencil.HaloExchange(EFoo,compress); OStencil.HaloExchange(OFoo,compress); - + Bar = Cshift(Foo,dir,disp); if ( disp & 0x1 ) { ECheck.checkerboard = Even; OCheck.checkerboard = Odd; - } else { + } else { ECheck.checkerboard = Odd; OCheck.checkerboard = Even; } @@ -206,7 +205,7 @@ int main (int argc, char ** argv) permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type); else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; - else + else OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; } for(int i=0;ioSites();i++){ @@ -214,18 +213,18 @@ int main (int argc, char ** argv) StencilEntry *SE; SE = OStencil.GetEntry(permute_type,0,i); // std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type); else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; - else + else ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; } - + setCheckerboard(Check,ECheck); setCheckerboard(Check,OCheck); - + Real nrmC = norm2(Check); Real nrmB = norm2(Bar); Diff = Check-Bar; @@ -248,10 +247,10 @@ int main (int argc, char ** argv) diff =norm2(ddiff); if ( diff > 0){ std::cout <<"Coor (" << coor[0]<<","< -namespace Grid{ +namespace Grid { class ScalarActionParameters : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters, @@ -44,7 +44,7 @@ int main(int argc, char **argv) { // here make a routine to print all the relevant information on the run std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; - // Typedefs to simplify notation + // Typedefs to simplify notation typedef ScalarAdjGenericHMCRunner HMCWrapper; // Uses the default minimum norm, real scalar fields 
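  // (Editorial aside, kept as a C++ comment since it sits inside a hunk; not
  //  part of the original patch. The angle-bracket template arguments were
  //  stripped from this document during extraction. Judging from the comment
  //  above and the typedef added to GenericHMCrunner.h in the first patch of
  //  this series, the wrapper is presumably declared along the lines of
  //
  //    typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2,
  //                               ScalarMatrixFields> ScalarAdjGenericHMCRunner;
  //
  //  i.e. adjoint-representation scalar fields evolved with the default
  //  MinimumNorm2 integrator; the exact argument list is an assumption, not
  //  something recoverable from this document.)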
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: @@ -52,7 +52,7 @@ int main(int argc, char **argv) { // Grid from the command line GridModule ScalarGrid; - ScalarGrid.set_full( SpaceTimeGrid::makeFourDimGrid( + ScalarGrid.set_full(SpaceTimeGrid::makeFourDimGrid( GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi())); ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full())); @@ -89,12 +89,11 @@ int main(int argc, char **argv) { ///////////////////////////////////////////////////////////// // HMC parameters are serialisable - TheHMC.Parameters.MD.MDsteps = 10; + TheHMC.Parameters.MD.MDsteps = 20; TheHMC.Parameters.MD.trajL = 1.0; TheHMC.ReadCommandLine(argc, argv); TheHMC.Run(); Grid_finalize(); - -} // main +} // main From 038b6ee9cdfc5902b27a8645b1f1758c9db3656f Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 16 Mar 2017 01:09:24 +0900 Subject: [PATCH 003/170] Fixing JSON compilation error --- lib/json/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/json/json.hpp b/lib/json/json.hpp index 97214f0b..bfb38c3e 100644 --- a/lib/json/json.hpp +++ b/lib/json/json.hpp @@ -64,7 +64,7 @@ SOFTWARE. #endif #elif defined(__GNUC__) #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) - #if GCC_VERSION < 40900 + #if GCC_VERSION < 40800 #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" #endif #endif From ef0fe2bcc12c95d37a8b537ba1694da22da8ab41 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 21 Mar 2017 11:39:46 +0000 Subject: [PATCH 004/170] Added empty ScalarFV module --- extras/Hadrons/Modules.hpp | 1 + extras/Hadrons/Modules/MScalar/ScalarFV.cc | 40 +++++++++++++++++++ extras/Hadrons/Modules/MScalar/ScalarFV.hpp | 44 +++++++++++++++++++++ extras/Hadrons/modules.inc | 4 +- 4 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 extras/Hadrons/Modules/MScalar/ScalarFV.cc create mode 100644 extras/Hadrons/Modules/MScalar/ScalarFV.hpp diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp index a25419c5..b482eded 100644 --- a/extras/Hadrons/Modules.hpp +++ b/extras/Hadrons/Modules.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.cc b/extras/Hadrons/Modules/MScalar/ScalarFV.cc new file mode 100644 index 00000000..26f9f648 --- /dev/null +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.cc @@ -0,0 +1,40 @@ +#include + +using namespace Grid; +using namespace Hadrons; +using namespace MScalar; + +/****************************************************************************** +* TScalarFV implementation * +******************************************************************************/ +// constructor ///////////////////////////////////////////////////////////////// +TScalarFV::TScalarFV(const std::string name) +: Module(name) +{} + +// dependencies/products /////////////////////////////////////////////////////// +std::vector TScalarFV::getInput(void) +{ + std::vector in; + + return in; +} + +std::vector TScalarFV::getOutput(void) +{ + std::vector out = {getName()}; + + return out; +} + +// setup /////////////////////////////////////////////////////////////////////// +void TScalarFV::setup(void) +{ + +} + +// execution /////////////////////////////////////////////////////////////////// +void TScalarFV::execute(void) +{ + +} diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp 
b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp new file mode 100644 index 00000000..d350bcae --- /dev/null +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp @@ -0,0 +1,44 @@ +#ifndef Hadrons_ScalarFV_hpp_ +#define Hadrons_ScalarFV_hpp_ + +#include +#include +#include + +BEGIN_HADRONS_NAMESPACE + +/****************************************************************************** + * ScalarFV * + ******************************************************************************/ +BEGIN_MODULE_NAMESPACE(MScalar) + +class ScalarFVPar: Serializable +{ +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarFVPar, + unsigned int, i); +}; + +class TScalarFV: public Module +{ +public: + // constructor + TScalarFV(const std::string name); + // destructor + virtual ~TScalarFV(void) = default; + // dependency relation + virtual std::vector getInput(void); + virtual std::vector getOutput(void); + // setup + virtual void setup(void); + // execution + virtual void execute(void); +}; + +MODULE_REGISTER_NS(ScalarFV, TScalarFV, MScalar); + +END_MODULE_NAMESPACE + +END_HADRONS_NAMESPACE + +#endif // Hadrons_ScalarFV_hpp_ diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc index dfbe85ff..f368bbdc 100644 --- a/extras/Hadrons/modules.inc +++ b/extras/Hadrons/modules.inc @@ -4,7 +4,8 @@ modules_cc =\ Modules/MGauge/StochEm.cc \ Modules/MGauge/Unit.cc \ Modules/MScalar/ChargedProp.cc \ - Modules/MScalar/FreeProp.cc + Modules/MScalar/FreeProp.cc \ + Modules/MScalar/ScalarFV.cc modules_hpp =\ Modules/MAction/DWF.hpp \ @@ -18,6 +19,7 @@ modules_hpp =\ Modules/MScalar/ChargedProp.hpp \ Modules/MScalar/FreeProp.hpp \ Modules/MScalar/Scalar.hpp \ + Modules/MScalar/ScalarFV.hpp \ Modules/MSolver/RBPrecCG.hpp \ Modules/MSource/Point.hpp \ Modules/MSource/SeqGamma.hpp \ From 54c10a42cc5e5c46e55b4aa8faba6930927e56e1 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 24 Mar 2017 11:42:32 +0000 Subject: [PATCH 005/170] Add source and emField inputs to ScalarFV module --- extras/Hadrons/Modules/MScalar/ScalarFV.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.cc b/extras/Hadrons/Modules/MScalar/ScalarFV.cc index 26f9f648..e7a72abe 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.cc @@ -15,7 +15,7 @@ TScalarFV::TScalarFV(const std::string name) // dependencies/products /////////////////////////////////////////////////////// std::vector TScalarFV::getInput(void) { - std::vector in; + std::vector in = {par().source, par().emField}; return in; } From 0c006fbfaac3d47c892fb1305002149d7f7fc31b Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 24 Mar 2017 11:59:09 +0000 Subject: [PATCH 006/170] Add ScalarFV inputs to ScalarFV.hpp --- extras/Hadrons/Modules/MScalar/ScalarFV.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp index d350bcae..f7802bda 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp @@ -16,6 +16,11 @@ class ScalarFVPar: Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarFVPar, + std::string, emField, + std::string, source, + double, mass, + double, charge, + std::string, output, unsigned int, i); }; From 85516e9c7c5f1fb3096777e2995a7005f2fbc675 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 24 Mar 2017 17:13:55 +0000 Subject: [PATCH 007/170] Output all terms of scalar propagator separately --- 
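Reviewer note (a schematic summary placed in the patch-notes slot, not part of
the committed change; the names are those used in ChargedProp.cc below): with
G the free momentum-space scalar propagator, F the lattice Fourier transform,
Src the source and q the charge, the terms this patch writes out separately
assemble the full charged propagator as

    prop      = prop_0 + q*prop_D1 + q^2*prop_D1D1 + q^2*prop_D2

    prop_0    =  Finv*G*F*Src
    prop_D1   = -Finv*G*momD1*G*F*Src               (momD1 = F*D1*Finv)
    prop_D1D1 =  Finv*G*momD1*G*momD1*G*F*Src
    prop_D2   = -Finv*G*momD2*G*F*Src               (momD2 = F*D2*Finv)

so the O(q) and O(q^2) pieces can be inspected, and written to file, one by
one before the sum is formed.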
extras/Hadrons/Modules/MScalar/ChargedProp.cc | 90 +++++++++++++++---- .../Hadrons/Modules/MScalar/ChargedProp.hpp | 5 +- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc index 40d4504c..b76ea8d2 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc @@ -23,7 +23,8 @@ std::vector TChargedProp::getInput(void) std::vector TChargedProp::getOutput(void) { - std::vector out = {getName()}; + std::vector out = {getName(), getName()+"_0", getName()+"_D1", + getName()+"_D1D1", getName()+"_D2"}; return out; } @@ -38,6 +39,10 @@ void TChargedProp::setup(void) phaseName_.push_back("_shiftphase_" + std::to_string(mu)); } GFSrcName_ = "_" + getName() + "_DinvSrc"; + prop0Name_ = getName() + "_0"; + propD1Name_ = getName() + "_D1"; + propD1D1Name_ = getName() + "_D1D1"; + propD2Name_ = getName() + "_D2"; if (!env().hasRegisteredObject(freeMomPropName_)) { env().registerLattice(freeMomPropName_); @@ -53,7 +58,14 @@ void TChargedProp::setup(void) { env().registerLattice(GFSrcName_); } + if (!env().hasRegisteredObject(prop0Name_)) + { + env().registerLattice(prop0Name_); + } env().registerLattice(getName()); + env().registerLattice(propD1Name_); + env().registerLattice(propD1D1Name_); + env().registerLattice(propD2Name_); } // execution /////////////////////////////////////////////////////////////////// @@ -64,7 +76,7 @@ void TChargedProp::execute(void) Complex ci(0.0,1.0); FFT fft(env().getGrid()); - // cache free scalar propagator + // cache momentum-space free scalar propagator if (!env().hasCreatedObject(freeMomPropName_)) { LOG(Message) << "Caching momentum space free scalar propagator" @@ -88,6 +100,17 @@ void TChargedProp::execute(void) { GFSrc_ = env().getObject(GFSrcName_); } + // cache free scalar propagator + if (!env().hasCreatedObject(prop0Name_)) + { + prop0_ = env().createLattice(prop0Name_); + *prop0_ = *GFSrc_; + fft.FFT_all_dim(*prop0_, *prop0_, FFT::backward); + } + else + { + prop0_ = env().getObject(prop0Name_); + } // cache phases if (!env().hasCreatedObject(phaseName_[0])) { @@ -117,30 +140,33 @@ void TChargedProp::execute(void) << ", charge= " << par().charge << ")..." 
<< std::endl; ScalarField &prop = *env().createLattice(getName()); + ScalarField &propD1 = *env().createLattice(propD1Name_); + ScalarField &propD1D1 = *env().createLattice(propD1D1Name_); + ScalarField &propD2 = *env().createLattice(propD2Name_); ScalarField buf(env().getGrid()); ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_; double q = par().charge; - // G*F*Src - prop = GFSrc; - - // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv) + // -G*momD1*G*F*Src (momD1 = F*D1*Finv) buf = GFSrc; momD1(buf, fft); - buf = G*buf; - prop = prop - q*buf; + buf = -G*buf; + fft.FFT_all_dim(propD1, buf, FFT::backward); - // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src) + // G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src) + buf = -buf; momD1(buf, fft); - prop = prop + q*q*G*buf; + propD1D1 = G*buf; + fft.FFT_all_dim(propD1D1, propD1D1, FFT::backward); - // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv) + // -G*momD2*G*F*Src (momD2 = F*D2*Finv) buf = GFSrc; momD2(buf, fft); - prop = prop - q*q*G*buf; + buf = -G*buf; + fft.FFT_all_dim(propD2, buf, FFT::backward); - // final FT - fft.FFT_all_dim(prop, prop, FFT::backward); + // full charged scalar propagator + prop = (*prop0_) + q*propD1 + q*q*propD1D1 + q*q*propD2; // OUTPUT IF NECESSARY if (!par().output.empty()) @@ -155,14 +181,48 @@ void TChargedProp::execute(void) std::vector vecBuf; std::vector result; + write(writer, "charge", q); + + // Write full propagator sliceSum(prop, vecBuf, Tp); result.resize(vecBuf.size()); for (unsigned int t = 0; t < vecBuf.size(); ++t) { result[t] = TensorRemove(vecBuf[t]); } - write(writer, "charge", q); write(writer, "prop", result); + + // Write free propagator + sliceSum(*prop0_, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_0", result); + + // Write propagator D1 term + sliceSum(propD1, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_D1", result); + + // Write propagator D1D1 term + sliceSum(propD1D1, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_D1D1", result); + + // Write propagator D2 term + sliceSum(propD2, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_D2", result); } } diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp index 8bb5faa0..6a6c6c39 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp @@ -45,9 +45,10 @@ private: void momD1(ScalarField &s, FFT &fft); void momD2(ScalarField &s, FFT &fft); private: - std::string freeMomPropName_, GFSrcName_; + std::string freeMomPropName_, GFSrcName_, prop0Name_, + propD1Name_, propD1D1Name_, propD2Name_; std::vector phaseName_; - ScalarField *freeMomProp_, *GFSrc_; + ScalarField *freeMomProp_, *GFSrc_, *prop0_; std::vector phase_; EmField *A; }; From 483fd3cfa1e65d894f27b9f04aa14301d644416e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 27 Mar 2017 13:24:51 +0100 Subject: [PATCH 008/170] Add propagator expansion terms as inputs to ScalarFV --- extras/Hadrons/Modules/MScalar/ScalarFV.cc | 7 ++++++- extras/Hadrons/Modules/MScalar/ScalarFV.hpp | 9 ++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.cc 
b/extras/Hadrons/Modules/MScalar/ScalarFV.cc index e7a72abe..2a50a62f 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.cc @@ -15,7 +15,12 @@ TScalarFV::TScalarFV(const std::string name) // dependencies/products /////////////////////////////////////////////////////// std::vector TScalarFV::getInput(void) { - std::vector in = {par().source, par().emField}; + std::string prop0Name = par().scalarProp + "_0"; + std::string propD1Name = par().scalarProp + "_D1"; + std::string propD1D1Name = par().scalarProp + "_D1D1"; + std::string propD2Name = par().scalarProp + "_D2"; + std::vector in = {par().source, par().emField, par().scalarProp, + prop0Name, propD1Name, propD1D1Name, propD2Name}; return in; } diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp index f7802bda..fda174db 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarFV.hpp @@ -18,14 +18,17 @@ public: GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarFVPar, std::string, emField, std::string, source, - double, mass, + std::string, scalarProp, double, charge, - std::string, output, - unsigned int, i); + std::string, output); }; class TScalarFV: public Module { +public: + SCALAR_TYPE_ALIASES(SIMPL,); + typedef PhotonR::GaugeField EmField; + typedef PhotonR::GaugeLinkField EmComp; public: // constructor TScalarFV(const std::string name); From 4512dbdf584e07acaec4fc86f25985446fc89aa7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 27 Mar 2017 15:02:16 +0100 Subject: [PATCH 009/170] Rename module ScalarFV to ScalarVP --- extras/Hadrons/Modules.hpp | 2 +- .../MScalar/{ScalarFV.cc => ScalarVP.cc} | 27 ++++++++++--------- .../MScalar/{ScalarFV.hpp => ScalarVP.hpp} | 22 ++++++++------- extras/Hadrons/modules.inc | 4 +-- 4 files changed, 29 insertions(+), 26 deletions(-) rename extras/Hadrons/Modules/MScalar/{ScalarFV.cc => ScalarVP.cc} (54%) rename extras/Hadrons/Modules/MScalar/{ScalarFV.hpp => ScalarVP.hpp} (69%) diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp index b482eded..c1d90e3a 100644 --- a/extras/Hadrons/Modules.hpp +++ b/extras/Hadrons/Modules.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc similarity index 54% rename from extras/Hadrons/Modules/MScalar/ScalarFV.cc rename to extras/Hadrons/Modules/MScalar/ScalarVP.cc index 2a50a62f..dff636dd 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -1,31 +1,32 @@ -#include +#include using namespace Grid; using namespace Hadrons; using namespace MScalar; /****************************************************************************** -* TScalarFV implementation * +* TScalarVP implementation * ******************************************************************************/ // constructor ///////////////////////////////////////////////////////////////// -TScalarFV::TScalarFV(const std::string name) -: Module(name) +TScalarVP::TScalarVP(const std::string name) +: Module(name) {} // dependencies/products /////////////////////////////////////////////////////// -std::vector TScalarFV::getInput(void) +std::vector TScalarVP::getInput(void) { - std::string prop0Name = par().scalarProp + "_0"; - std::string propD1Name = par().scalarProp + "_D1"; - std::string propD1D1Name = par().scalarProp + "_D1D1"; - std::string propD2Name = 
par().scalarProp + "_D2"; + prop0Name_ = par().scalarProp + "_0"; + propD1Name_ = par().scalarProp + "_D1"; + propD1D1Name_ = par().scalarProp + "_D1D1"; + propD2Name_ = par().scalarProp + "_D2"; std::vector in = {par().source, par().emField, par().scalarProp, - prop0Name, propD1Name, propD1D1Name, propD2Name}; + prop0Name_, propD1Name_, propD1D1Name_, + propD2Name_}; return in; } -std::vector TScalarFV::getOutput(void) +std::vector TScalarVP::getOutput(void) { std::vector out = {getName()}; @@ -33,13 +34,13 @@ std::vector TScalarFV::getOutput(void) } // setup /////////////////////////////////////////////////////////////////////// -void TScalarFV::setup(void) +void TScalarVP::setup(void) { } // execution /////////////////////////////////////////////////////////////////// -void TScalarFV::execute(void) +void TScalarVP::execute(void) { } diff --git a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp similarity index 69% rename from extras/Hadrons/Modules/MScalar/ScalarFV.hpp rename to extras/Hadrons/Modules/MScalar/ScalarVP.hpp index fda174db..3c3be434 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarFV.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -1,5 +1,5 @@ -#ifndef Hadrons_ScalarFV_hpp_ -#define Hadrons_ScalarFV_hpp_ +#ifndef Hadrons_ScalarVP_hpp_ +#define Hadrons_ScalarVP_hpp_ #include #include @@ -8,14 +8,14 @@ BEGIN_HADRONS_NAMESPACE /****************************************************************************** - * ScalarFV * + * ScalarVP * ******************************************************************************/ BEGIN_MODULE_NAMESPACE(MScalar) -class ScalarFVPar: Serializable +class ScalarVPPar: Serializable { public: - GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarFVPar, + GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar, std::string, emField, std::string, source, std::string, scalarProp, @@ -23,7 +23,7 @@ public: std::string, output); }; -class TScalarFV: public Module +class TScalarVP: public Module { public: SCALAR_TYPE_ALIASES(SIMPL,); @@ -31,9 +31,9 @@ public: typedef PhotonR::GaugeLinkField EmComp; public: // constructor - TScalarFV(const std::string name); + TScalarVP(const std::string name); // destructor - virtual ~TScalarFV(void) = default; + virtual ~TScalarVP(void) = default; // dependency relation virtual std::vector getInput(void); virtual std::vector getOutput(void); @@ -41,12 +41,14 @@ public: virtual void setup(void); // execution virtual void execute(void); +private: + std::string prop0Name_, propD1Name_, propD1D1Name_, propD2Name_; }; -MODULE_REGISTER_NS(ScalarFV, TScalarFV, MScalar); +MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar); END_MODULE_NAMESPACE END_HADRONS_NAMESPACE -#endif // Hadrons_ScalarFV_hpp_ +#endif // Hadrons_ScalarVP_hpp_ diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc index f368bbdc..383cb6b5 100644 --- a/extras/Hadrons/modules.inc +++ b/extras/Hadrons/modules.inc @@ -5,7 +5,7 @@ modules_cc =\ Modules/MGauge/Unit.cc \ Modules/MScalar/ChargedProp.cc \ Modules/MScalar/FreeProp.cc \ - Modules/MScalar/ScalarFV.cc + Modules/MScalar/ScalarVP.cc modules_hpp =\ Modules/MAction/DWF.hpp \ @@ -19,7 +19,7 @@ modules_hpp =\ Modules/MScalar/ChargedProp.hpp \ Modules/MScalar/FreeProp.hpp \ Modules/MScalar/Scalar.hpp \ - Modules/MScalar/ScalarFV.hpp \ + Modules/MScalar/ScalarVP.hpp \ Modules/MSolver/RBPrecCG.hpp \ Modules/MSource/Point.hpp \ Modules/MSource/SeqGamma.hpp \ From 9f755e0379e192826a593d758a53124bd28b781a Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 27 Mar 2017 
16:49:18 +0100 Subject: [PATCH 010/170] Add functions momD1 and momD2 to ScalarVP --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 68 ++++++++++++++++++++- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 5 ++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index dff636dd..423fb1a2 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -15,7 +15,7 @@ TScalarVP::TScalarVP(const std::string name) // dependencies/products /////////////////////////////////////////////////////// std::vector TScalarVP::getInput(void) { - prop0Name_ = par().scalarProp + "_0"; + prop0Name_ = par().scalarProp + "_0"; propD1Name_ = par().scalarProp + "_D1"; propD1D1Name_ = par().scalarProp + "_D1D1"; propD2Name_ = par().scalarProp + "_D2"; @@ -36,11 +36,77 @@ std::vector TScalarVP::getOutput(void) // setup /////////////////////////////////////////////////////////////////////// void TScalarVP::setup(void) { + phaseName_.clear(); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + phaseName_.push_back("_shiftphase_" + std::to_string(mu)); + } } // execution /////////////////////////////////////////////////////////////////// void TScalarVP::execute(void) { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + phase_.push_back(env().getObject(phaseName_[mu])); + } } + +void TScalarVP::momD1(ScalarField &s, EmField &A, FFT &fft) +{ + ScalarField buf(env().getGrid()), result(env().getGrid()), + Amu(env().getGrid()); + Complex ci(0.0,1.0); + + result = zero; + + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + Amu = peekLorentz(A, mu); + buf = (*phase_[mu])*s; + fft.FFT_all_dim(buf, buf, FFT::backward); + buf = Amu*buf; + fft.FFT_all_dim(buf, buf, FFT::forward); + result = result - ci*buf; + } + fft.FFT_all_dim(s, s, FFT::backward); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + Amu = peekLorentz(A, mu); + buf = Amu*s; + fft.FFT_all_dim(buf, buf, FFT::forward); + result = result + ci*adj(*phase_[mu])*buf; + } + + s = result; +} + +void TScalarVP::momD2(ScalarField &s, EmField &Asquared, FFT &fft) +{ + ScalarField buf(env().getGrid()), result(env().getGrid()), + A2mu(env().getGrid()); + + result = zero; + + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + A2mu = peekLorentz(Asquared, mu); + buf = (*phase_[mu])*s; + fft.FFT_all_dim(buf, buf, FFT::backward); + buf = A2mu*buf; + fft.FFT_all_dim(buf, buf, FFT::forward); + result = result + .5*buf; + } + fft.FFT_all_dim(s, s, FFT::backward); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + A2mu = peekLorentz(Asquared, mu); + buf = A2mu*s; + fft.FFT_all_dim(buf, buf, FFT::forward); + result = result + .5*adj(*phase_[mu])*buf; + } + + s = result; +} diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 3c3be434..92a4f246 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -41,8 +41,13 @@ public: virtual void setup(void); // execution virtual void execute(void); +private: + void momD1(ScalarField &s, EmField &A, FFT &fft); + void momD2(ScalarField &s, EmField &Asquared, FFT &fft); private: std::string prop0Name_, propD1Name_, propD1D1Name_, propD2Name_; + std::vector phaseName_; + std::vector phase_; }; MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar); From 7b03d8d0879d7f7922b8867eefa9346cb0e5c425 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 5 Apr 2017 
16:17:46 +0100 Subject: [PATCH 011/170] Fixing the remaining merge conflicts --- lib/qcd/action/scalar/Scalar.h | 5 +++++ tests/Test_stencil.cc | 7 ------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h index e5bea275..cae38360 100644 --- a/lib/qcd/action/scalar/Scalar.h +++ b/lib/qcd/action/scalar/Scalar.h @@ -31,6 +31,7 @@ directory #include #include +#include namespace Grid { namespace QCD { @@ -39,6 +40,10 @@ namespace QCD { typedef ScalarAction ScalarActionF; typedef ScalarAction ScalarActionD; + typedef ScalarInteractionAction ScalarAdjActionR; + typedef ScalarInteractionAction ScalarAdjActionF; + typedef ScalarInteractionAction ScalarAdjActionD; + } } diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 2a4744f3..fa4b0b57 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -189,13 +189,6 @@ int main(int argc, char ** argv) { SimpleCompressor compress; -<<<<<<< HEAD - EStencil.HaloExchange(EFoo,compress); - OStencil.HaloExchange(OFoo,compress); - -======= - ->>>>>>> feature/hmc_generalise Bar = Cshift(Foo,dir,disp); if ( disp & 0x1 ) { From 26ebe41fef37d310bba59d378060a26749c1a54b Mon Sep 17 00:00:00 2001 From: James Harrison Date: Mon, 10 Apr 2017 16:33:54 +0100 Subject: [PATCH 012/170] QedFVol: Implement charged propagator calculation within ScalarVP module --- extras/Hadrons/Modules/MScalar/ChargedProp.cc | 2 +- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 230 ++++++++++++++++-- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 16 +- 3 files changed, 225 insertions(+), 23 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc index b76ea8d2..b68b144d 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc @@ -100,7 +100,7 @@ void TChargedProp::execute(void) { GFSrc_ = env().getObject(GFSrcName_); } - // cache free scalar propagator + // cache position-space free scalar propagator if (!env().hasCreatedObject(prop0Name_)) { prop0_ = env().createLattice(prop0Name_); diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 423fb1a2..66cdea7e 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -15,13 +15,7 @@ TScalarVP::TScalarVP(const std::string name) // dependencies/products /////////////////////////////////////////////////////// std::vector TScalarVP::getInput(void) { - prop0Name_ = par().scalarProp + "_0"; - propD1Name_ = par().scalarProp + "_D1"; - propD1D1Name_ = par().scalarProp + "_D1D1"; - propD2Name_ = par().scalarProp + "_D2"; - std::vector in = {par().source, par().emField, par().scalarProp, - prop0Name_, propD1Name_, propD1D1Name_, - propD2Name_}; + std::vector in = {par().source, par().emField}; return in; } @@ -36,26 +30,229 @@ std::vector TScalarVP::getOutput(void) // setup /////////////////////////////////////////////////////////////////////// void TScalarVP::setup(void) { + freeMomPropName_ = FREEMOMPROP(par().mass); + GFSrcName_ = "_" + getName() + "_DinvSrc"; + prop0Name_ = getName() + "_prop0"; + propQName_ = getName() + "_propQ"; + propSunName_ = getName() + "_propSun"; + propTadName_ = getName() + "_propTad"; + phaseName_.clear(); + muGFSrcName_.clear(); + muProp0Name_.clear(); + muPropQName_.clear(); + muPropSunName_.clear(); + muPropTadName_.clear(); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { phaseName_.push_back("_shiftphase_" 
+ std::to_string(mu)); + muGFSrcName.push_back("_" + getName() + "_DinvSrc_" + std::to_string(mu)); + muProp0Name_.push_back(getName() + "_prop0_" + std::to_string(mu)); + muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); + muPropSunName_.push_back(getName() + "_propSun_" + std::to_string(mu)); + muPropTadName_.push_back(getName() + "_propTad_" + std::to_string(mu)); } + if (!env().hasRegisteredObject(freeMomPropName_)) + { + env().registerLattice(freeMomPropName_); + } + if (!env().hasRegisteredObject(phaseName_[0])) + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(phaseName_[mu]); + } + } + if (!env().hasRegisteredObject(GFSrcName_)) + { + env().registerLattice(GFSrcName_); + } + if (!env().hasRegisteredObject(muGFSrcName_[0])) + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(muGFSrcName_[mu]); + } + } + if (!env().hasRegisteredObject(prop0Name_)) + { + env().registerLattice(prop0Name_); + } + if (!env().hasRegisteredObject(muProp0Name_[0])) + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(muProp0Name_[mu]); + } + } + env().registerLattice(propQName_); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(muPropQName_[mu]); + } + env().registerLattice(propSunName_); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(muPropSunName_[mu]); + } + env().registerLattice(propTadName_); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + env().registerLattice(muPropTadName_[mu]); + } + env().registerLattice(getName()); } // execution /////////////////////////////////////////////////////////////////// void TScalarVP::execute(void) { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) + // CACHING ANALYTIC EXPRESSIONS + ScalarField &source = *env().getObject(par().source); + Complex ci(0.0,1.0); + FFT fft(env().getGrid()); + + // cache momentum-space free scalar propagator + if (!env().hasCreatedObject(freeMomPropName_)) { - phase_.push_back(env().getObject(phaseName_[mu])); + LOG(Message) << "Caching momentum space free scalar propagator" + << " (mass= " << par().mass << ")..." << std::endl; + freeMomProp_ = env().createLattice(freeMomPropName_); + Scalar::MomentumSpacePropagator(*freeMomProp_, par().mass); + } + else + { + freeMomProp_ = env().getObject(freeMomPropName_); + } + // cache phases + if (!env().hasCreatedObject(phaseName_[0])) + { + std::vector &l = env().getGrid()->_fdimensions; + + LOG(Message) << "Caching shift phases..." 
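        // (Aside, not part of the patch -- a hedged reading of the code that
        //  follows: each cached phase is exp(i*2*pi*x_mu/L_mu) evaluated on
        //  the lattice coordinate in direction mu, i.e. the momentum-space
        //  kernel of a one-site shift. Multiplying a Fourier-transformed
        //  field by phase_[mu], or by its adjoint, therefore plays the role
        //  of Cshift(.., mu, +/-1) inside momD1 and momD2 without leaving
        //  momentum space.)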
<< std::endl; + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + Real twoPiL = M_PI*2./l[mu]; + + phase_.push_back(env().createLattice(phaseName_[mu])); + LatticeCoordinate(*(phase_[mu]), mu); + *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu]))); + } + } + else + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + phase_.push_back(env().getObject(phaseName_[mu])); + } + } + // cache G*F*src + if (!env().hasCreatedObject(GFSrcName_)) + { + GFSrc_ = env().createLattice(GFSrcName_); + fft.FFT_all_dim(*GFSrc_, source, FFT::forward); + *GFSrc_ = (*freeMomProp_)*(*GFSrc_); + } + else + { + GFSrc_ = env().getObject(GFSrcName_); + } + // cache G*exp(i*k_mu)*F*src + if (!env().hasCreatedObject(muGFSrcName_[0])) + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + muGFSrc_.push_back(env().createLattice(muGFSrcName_[mu])); + fft.FFT_all_dim(*(muGFSrc_[mu]), source, FFT::forward); + *(muGFSrc_[mu]) = (*freeMomProp_)*(*phase_[mu])*(*muGFSrc_[mu]); + } + } + else + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + muGFSrc_.push_back(env().getObject(muGFSrcName_[mu])); + } + } + // cache position-space free scalar propagators + if (!env().hasCreatedObject(prop0Name_)) + { + prop0_ = env().createLattice(prop0Name_); + fft.FFT_all_dim(*prop0_, *GFSrc_, FFT::backward); + } + else + { + prop0_ = env().getObject(prop0Name_); + } + if (!env().hasCreatedObject(muProp0Name_[0])) + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + muProp0_.push_back(env().createLattice(muProp0Name_[mu])); + fft.FFT_all_dim(*(muProp0_[mu]), *(muGFSrc_[mu]), FFT::backward); + } + } + else + { + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + muProp0_.push_back(env().getObject(muProp0Name_[mu])); + } } + // PROPAGATOR CALCULATION + // Propagator from unshifted source + ScalarField &propQ = *env().createLattice(propQName_); + ScalarField &propSun = *env().createLattice(propSunName_); + ScalarField &propTad = *env().createLattice(propTadName_); + chargedProp(propQ, propSun, propTad, *GFSrc_, fft); + + // Propagators from shifted sources + std::vector muPropQ_, muPropSun_, muPropTad_; + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + muPropQ_.push_back(env().createLattice(muPropQName_[mu])); + muPropSun_.push_back(env().createLattice(muPropSunName_[mu])); + muPropTad_.push_back(env().createLattice(muPropTadName_[mu])); + chargedProp(*(muPropQ_[mu]), *(muPropSun_[mu]), *(muPropTad_[mu]), + *(muGFSrc_[mu]), fft); + } + } -void TScalarVP::momD1(ScalarField &s, EmField &A, FFT &fft) +// Calculate O(q) and O(q^2) terms of momentum-space charged propagator +void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, + ScalarField &prop_tad, ScalarField &GFSrc, + FFT &fft) { + Complex ci(0.0,1.0); + double q = par().charge; + ScalarField &G = *freeMomProp_; + ScalarField buf(env().getGrid()); + + LOG(Message) << "Computing charged scalar propagator" + << " (mass= " << par().mass + << ", charge= " << q << ")..." 
<< std::endl; + + // -q*G*momD1*G*F*Src (momD1 = F*D1*Finv) + buf = GFSrc; + momD1(buf, fft); + buf = G*buf; + prop_q = -q*buf; + + // q*q*G*momD1*G*momD1*G*F*Src + momD1(buf, fft); + prop_sun = q*q*G*buf; + + // -q*q*G*momD2*G*F*Src (momD2 = F*D2*Finv) + buf = GFSrc; + momD2(buf, fft); + prop_tad = -q*q*G*buf; +} + +void TScalarVP::momD1(ScalarField &s, FFT &fft) +{ + EmField &A = *env().getObject(par().emField); ScalarField buf(env().getGrid()), result(env().getGrid()), Amu(env().getGrid()); Complex ci(0.0,1.0); @@ -83,27 +280,28 @@ void TScalarVP::momD1(ScalarField &s, EmField &A, FFT &fft) s = result; } -void TScalarVP::momD2(ScalarField &s, EmField &Asquared, FFT &fft) +void TScalarVP::momD2(ScalarField &s, FFT &fft) { + EmField &A = *env().getObject(par().emField); ScalarField buf(env().getGrid()), result(env().getGrid()), - A2mu(env().getGrid()); + Amu(env().getGrid()); result = zero; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - A2mu = peekLorentz(Asquared, mu); + Amu = peekLorentz(A, mu); buf = (*phase_[mu])*s; fft.FFT_all_dim(buf, buf, FFT::backward); - buf = A2mu*buf; + buf = Amu*Amu*buf; fft.FFT_all_dim(buf, buf, FFT::forward); result = result + .5*buf; } fft.FFT_all_dim(s, s, FFT::backward); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - A2mu = peekLorentz(Asquared, mu); - buf = A2mu*s; + Amu = peekLorentz(A, mu); + buf = Amu*Amu*s; fft.FFT_all_dim(buf, buf, FFT::forward); result = result + .5*adj(*phase_[mu])*buf; } diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 92a4f246..9d57a62a 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -18,7 +18,7 @@ public: GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar, std::string, emField, std::string, source, - std::string, scalarProp, + double, mass, double, charge, std::string, output); }; @@ -42,12 +42,16 @@ public: // execution virtual void execute(void); private: - void momD1(ScalarField &s, EmField &A, FFT &fft); - void momD2(ScalarField &s, EmField &Asquared, FFT &fft); + void momD1(ScalarField &s, FFT &fft); + void momD2(ScalarField &s, FFT &fft); private: - std::string prop0Name_, propD1Name_, propD1D1Name_, propD2Name_; - std::vector phaseName_; - std::vector phase_; + std::string freeMomPropName_, GFSrcName_, prop0Name_, + propQName_, propSunName_, propTadName_; + std::vector phaseName_, muGFSrcName_, muProp0Name_, + muPropQName, muPropSunName_, muPropTadName_; + ScalarField *freeMomProp_, *GFSrc_, *prop0_; + std::vector phase_, muGFSrc_, muProp0_; + emField *A; }; MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar); From c8e6f58e24ac04b0387bafd33334189528212627 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 13 Apr 2017 17:04:37 +0100 Subject: [PATCH 013/170] Fix typos in ScalarVP --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 3 ++- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 66cdea7e..79cc5574 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -1,4 +1,5 @@ #include +#include using namespace Grid; using namespace Hadrons; @@ -46,7 +47,7 @@ void TScalarVP::setup(void) for (unsigned int mu = 0; mu < env().getNd(); ++mu) { phaseName_.push_back("_shiftphase_" + std::to_string(mu)); - muGFSrcName.push_back("_" + getName() + "_DinvSrc_" + std::to_string(mu)); + 
muGFSrcName_.push_back("_" + getName() + "_DinvSrc_" + std::to_string(mu)); muProp0Name_.push_back(getName() + "_prop0_" + std::to_string(mu)); muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); muPropSunName_.push_back(getName() + "_propSun_" + std::to_string(mu)); diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 9d57a62a..1a2b82fb 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -42,16 +42,19 @@ public: // execution virtual void execute(void); private: + void chargedProp(ScalarField &prop_q, ScalarField &prop_sun, + ScalarField &prop_tad, ScalarField &GFSrc, + FFT &fft); void momD1(ScalarField &s, FFT &fft); void momD2(ScalarField &s, FFT &fft); private: std::string freeMomPropName_, GFSrcName_, prop0Name_, propQName_, propSunName_, propTadName_; std::vector phaseName_, muGFSrcName_, muProp0Name_, - muPropQName, muPropSunName_, muPropTadName_; + muPropQName_, muPropSunName_, muPropTadName_; ScalarField *freeMomProp_, *GFSrc_, *prop0_; std::vector phase_, muGFSrc_, muProp0_; - emField *A; + EmField *A; }; MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar); From bd466a55a89b98c54dbc1287e15feb558a372de4 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 25 Apr 2017 10:04:03 +0100 Subject: [PATCH 014/170] QedFVol: remove charge dependence in chargedProp function of ScalarVP --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 30 ++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 79cc5574..ed777387 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -23,8 +23,17 @@ std::vector TScalarVP::getInput(void) std::vector TScalarVP::getOutput(void) { - std::vector out = {getName()}; + std::vector out = {getName(), getName()+"_propQ", + getName()+"_propSun", + getName()+"_propTad"}; + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + out.push_back(getName() + "_propQ_" + std::to_string(mu)); + out.push_back(getName() + "_propSun_" + std::to_string(mu)); + out.push_back(getName() + "_propTad_" + std::to_string(mu)); + } + return out; } @@ -112,6 +121,7 @@ void TScalarVP::execute(void) ScalarField &source = *env().getObject(par().source); Complex ci(0.0,1.0); FFT fft(env().getGrid()); + double q = par().charge; // cache momentum-space free scalar propagator if (!env().hasCreatedObject(freeMomPropName_)) @@ -227,28 +237,28 @@ void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, FFT &fft) { Complex ci(0.0,1.0); - double q = par().charge; - ScalarField &G = *freeMomProp_; + ScalarField &G = *freeMomProp_; ScalarField buf(env().getGrid()); LOG(Message) << "Computing charged scalar propagator" << " (mass= " << par().mass - << ", charge= " << q << ")..." << std::endl; + << ", charge= " << par().charge << ")..." 
+ << std::endl; - // -q*G*momD1*G*F*Src (momD1 = F*D1*Finv) + // -G*momD1*G*F*Src (momD1 = F*D1*Finv) buf = GFSrc; momD1(buf, fft); buf = G*buf; - prop_q = -q*buf; + prop_q = -buf; - // q*q*G*momD1*G*momD1*G*F*Src + // G*momD1*G*momD1*G*F*Src momD1(buf, fft); - prop_sun = q*q*G*buf; + prop_sun = G*buf; - // -q*q*G*momD2*G*F*Src (momD2 = F*D2*Finv) + // -G*momD2*G*F*Src (momD2 = F*D2*Finv) buf = GFSrc; momD2(buf, fft); - prop_tad = -q*q*G*buf; + prop_tad = -G*buf; } void TScalarVP::momD1(ScalarField &s, FFT &fft) From 3ac27e559678f456d512f58670617b26ca1529e5 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 27 Apr 2017 14:17:50 +0100 Subject: [PATCH 015/170] QedFVol: remove unnecessary copies of free propagator from shifted sources in ScalarVP --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 57 ++------------------- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 6 +-- 2 files changed, 8 insertions(+), 55 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index ed777387..5a5ef4f0 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -48,16 +48,12 @@ void TScalarVP::setup(void) propTadName_ = getName() + "_propTad"; phaseName_.clear(); - muGFSrcName_.clear(); - muProp0Name_.clear(); muPropQName_.clear(); muPropSunName_.clear(); muPropTadName_.clear(); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { phaseName_.push_back("_shiftphase_" + std::to_string(mu)); - muGFSrcName_.push_back("_" + getName() + "_DinvSrc_" + std::to_string(mu)); - muProp0Name_.push_back(getName() + "_prop0_" + std::to_string(mu)); muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); muPropSunName_.push_back(getName() + "_propSun_" + std::to_string(mu)); muPropTadName_.push_back(getName() + "_propTad_" + std::to_string(mu)); @@ -78,24 +74,10 @@ void TScalarVP::setup(void) { env().registerLattice(GFSrcName_); } - if (!env().hasRegisteredObject(muGFSrcName_[0])) - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - env().registerLattice(muGFSrcName_[mu]); - } - } if (!env().hasRegisteredObject(prop0Name_)) { env().registerLattice(prop0Name_); } - if (!env().hasRegisteredObject(muProp0Name_[0])) - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - env().registerLattice(muProp0Name_[mu]); - } - } env().registerLattice(propQName_); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { @@ -168,23 +150,6 @@ void TScalarVP::execute(void) { GFSrc_ = env().getObject(GFSrcName_); } - // cache G*exp(i*k_mu)*F*src - if (!env().hasCreatedObject(muGFSrcName_[0])) - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - muGFSrc_.push_back(env().createLattice(muGFSrcName_[mu])); - fft.FFT_all_dim(*(muGFSrc_[mu]), source, FFT::forward); - *(muGFSrc_[mu]) = (*freeMomProp_)*(*phase_[mu])*(*muGFSrc_[mu]); - } - } - else - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - muGFSrc_.push_back(env().getObject(muGFSrcName_[mu])); - } - } // cache position-space free scalar propagators if (!env().hasCreatedObject(prop0Name_)) { @@ -195,21 +160,6 @@ void TScalarVP::execute(void) { prop0_ = env().getObject(prop0Name_); } - if (!env().hasCreatedObject(muProp0Name_[0])) - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - muProp0_.push_back(env().createLattice(muProp0Name_[mu])); - fft.FFT_all_dim(*(muProp0_[mu]), *(muGFSrc_[mu]), FFT::backward); - } - } - else - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - 
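// With the charge factors stripped out above, chargedProp now returns
// charge-independent building blocks; the q dependence is reinstated by the
// caller, which (as later patches in this series show) reassembles the
// propagator as, in sketch form,
//
//   prop = prop0 + q*propQ + q*q*(propSun + propTad);
//
// so one expensive computation serves any value of the charge.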
muProp0_.push_back(env().getObject(muProp0Name_[mu])); - } - } // PROPAGATOR CALCULATION // Propagator from unshifted source @@ -220,15 +170,18 @@ void TScalarVP::execute(void) // Propagators from shifted sources std::vector muPropQ_, muPropSun_, muPropTad_; + ScalarField buf(env().getGrid()); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { muPropQ_.push_back(env().createLattice(muPropQName_[mu])); muPropSun_.push_back(env().createLattice(muPropSunName_[mu])); muPropTad_.push_back(env().createLattice(muPropTadName_[mu])); + + buf = adj(*phase_[mu])*(*GFSrc_); chargedProp(*(muPropQ_[mu]), *(muPropSun_[mu]), *(muPropTad_[mu]), - *(muGFSrc_[mu]), fft); + buf, fft); } - + } // Calculate O(q) and O(q^2) terms of momentum-space charged propagator diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 1a2b82fb..0d93dc45 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -50,10 +50,10 @@ private: private: std::string freeMomPropName_, GFSrcName_, prop0Name_, propQName_, propSunName_, propTadName_; - std::vector phaseName_, muGFSrcName_, muProp0Name_, - muPropQName_, muPropSunName_, muPropTadName_; + std::vector phaseName_, muPropQName_, muPropSunName_, + muPropTadName_; ScalarField *freeMomProp_, *GFSrc_, *prop0_; - std::vector phase_, muGFSrc_, muProp0_; + std::vector phase_; EmField *A; }; From 2f0dd83016075d7541a0ef86289af64cfae73fed Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 3 May 2017 12:53:41 +0100 Subject: [PATCH 016/170] Calculate HVP using a single contraction of O(alpha) charged propagators. --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 131 ++++++++++++++++++++ extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 18 +-- 2 files changed, 142 insertions(+), 7 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 5a5ef4f0..9689a63f 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -32,6 +32,11 @@ std::vector TScalarVP::getOutput(void) out.push_back(getName() + "_propQ_" + std::to_string(mu)); out.push_back(getName() + "_propSun_" + std::to_string(mu)); out.push_back(getName() + "_propTad_" + std::to_string(mu)); + + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + out.push_back(getName() + "_" + std::to_string(mu) + "_" + std::to_string(nu)); + } } return out; @@ -51,12 +56,22 @@ void TScalarVP::setup(void) muPropQName_.clear(); muPropSunName_.clear(); muPropTadName_.clear(); + vpTensorName_.clear(); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) { phaseName_.push_back("_shiftphase_" + std::to_string(mu)); muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); muPropSunName_.push_back(getName() + "_propSun_" + std::to_string(mu)); muPropTadName_.push_back(getName() + "_propTad_" + std::to_string(mu)); + + std::vector vpTensorName_mu; + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu) + + "_" + std::to_string(nu)); + } + vpTensorName_.push_back(vpTensorName_mu); } if (!env().hasRegisteredObject(freeMomPropName_)) @@ -93,6 +108,13 @@ void TScalarVP::setup(void) { env().registerLattice(muPropTadName_[mu]); } + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + env().registerLattice(vpTensorName_[mu][nu]); + } + } env().registerLattice(getName()); } @@ -182,6 +204,115 @@ void 
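// The caches deleted above become redundant through a one-line identity:
// in momentum space a source shifted by one site in direction mu is the
// transformed source times the conjugate shift phase,
//
//   buf = adj(*phase_[mu])*(*GFSrc_);   // F(shifted src) = e^{-i k_mu} F(src)
//
// so the shifted-source propagators are built on the fly per direction
// instead of being stored as separate lattice objects.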
TScalarVP::execute(void) buf, fft); } + // CONTRACTIONS + vpTensor_.clear(); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + std::vector vpTensor_mu; + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + vpTensor_mu.push_back(env().createLattice(vpTensorName_[mu][nu])); + } + vpTensor_.push_back(vpTensor_mu); + } + ScalarField prop1(env().getGrid()), prop2(env().getGrid()); + EmField &A = *env().getObject(par().emField); + ScalarField Amu(env().getGrid()); + TComplex Anu0; + std::vector coor0 = {0, 0, 0, 0}; + + // Position-space implementation + prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; + fft.FFT_all_dim(prop1, prop1, FFT::backward); + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + peekSite(Anu0, peekLorentz(A, nu), coor0); + prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) + + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); + fft.FFT_all_dim(prop2, prop2, FFT::backward); + + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." + << std::endl; + Amu = peekLorentz(A, mu); + ScalarField &pi_mu_nu = *(vpTensor_[mu][nu]); + pi_mu_nu = adj(prop2) + * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) + * Cshift(prop1, mu, 1) + * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + pi_mu_nu -= Cshift(adj(prop2), mu, 1) + * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) + * prop1 + * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + pi_mu_nu = 2.0*real(pi_mu_nu); + } + } + + // // Momentum-space implementation + // ScalarField propbuf1(env().getGrid()), propbuf2(env().getGrid()); + // prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; + // for (unsigned int nu = 0; nu < env().getNd(); ++nu) + // { + // peekSite(Anu0, peekLorentz(A, nu), coor0); + // prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) + // + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); + + // for (unsigned int mu = 0; mu < env().getNd(); ++mu) + // { + // LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." + // << std::endl; + // Amu = peekLorentz(A, mu); + // ScalarField &pi_mu_nu = *(vpTensor_[mu][nu]); + // propbuf1 = (*phase_[mu])*prop1; + // fft.FFT_all_dim(propbuf1, propbuf1, FFT::backward); + // fft.FFT_all_dim(propbuf2, prop2, FFT::backward); + // pi_mu_nu = adj(propbuf2) + // * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) + // * propbuf1 + // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + // propbuf2 = (*phase_[mu])*prop2; + // fft.FFT_all_dim(propbuf1, prop1, FFT::backward); + // fft.FFT_all_dim(propbuf2, propbuf2, FFT::backward); + // pi_mu_nu -= adj(propbuf2) + // * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) + // * propbuf1 + // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + // pi_mu_nu = 2.0*real(pi_mu_nu); + // } + // } + + // OUTPUT IF NECESSARY + if (!par().output.empty()) + { + std::string filename = par().output + "." + + std::to_string(env().getTrajectory()); + + LOG(Message) << "Saving zero-momentum projection to '" + << filename << "'..." 
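// The contraction block above evaluates the conserved-current two-point
// function of lattice scalar QED. Writing S1 for the propagator from the
// unshifted source and S2 for the one from the nu-shifted source, it is,
// schematically,
//
//   Pi_mu_nu(x) = 2 Re[ adj(S2(x))    e^{+iqA_mu(x)} S1(x+mu) e^{iqA_nu(0)}
//                     - adj(S2(x+mu)) e^{-iqA_mu(x)} S1(x)    e^{iqA_nu(0)} ],
//
// with each exponential expanded to the order in q kept by the expansion,
// hence the (1 + i*q*A - 0.5*q*q*A*A) factors in the code.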
<< std::endl; + + CorrWriter writer(filename); + std::vector vecBuf; + std::vector result; + + write(writer, "charge", q); + write(writer, "mass", par().mass); + + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + sliceSum(*(vpTensor_[mu][nu]), vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } + } + } } // Calculate O(q) and O(q^2) terms of momentum-space charged propagator diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 0d93dc45..fbe73d85 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -48,13 +48,17 @@ private: void momD1(ScalarField &s, FFT &fft); void momD2(ScalarField &s, FFT &fft); private: - std::string freeMomPropName_, GFSrcName_, prop0Name_, - propQName_, propSunName_, propTadName_; - std::vector phaseName_, muPropQName_, muPropSunName_, - muPropTadName_; - ScalarField *freeMomProp_, *GFSrc_, *prop0_; - std::vector phase_; - EmField *A; + std::string freeMomPropName_, GFSrcName_, + prop0Name_, propQName_, + propSunName_, propTadName_; + std::vector phaseName_, muPropQName_, + muPropSunName_, muPropTadName_; + std::vector > vpTensorName_; + ScalarField *freeMomProp_, *GFSrc_, + *prop0_; + std::vector phase_; + std::vector > vpTensor_; + EmField *A; }; MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar); From db3837be22a6f654e2da45414a4692886ea38d56 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 3 May 2017 13:26:49 +0100 Subject: [PATCH 017/170] =?UTF-8?q?QedFVol:=20Change=20=E2=80=9Cdouble?= =?UTF-8?q?=E2=80=9D=20to=20=E2=80=9CReal=E2=80=9D=20in=20ScalarVP.cc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 9689a63f..7745c40e 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -125,7 +125,7 @@ void TScalarVP::execute(void) ScalarField &source = *env().getObject(par().source); Complex ci(0.0,1.0); FFT fft(env().getGrid()); - double q = par().charge; + Real q = par().charge; // cache momentum-space free scalar propagator if (!env().hasCreatedObject(freeMomPropName_)) From 6cb563a40c97c4a443e6f2d5621d7c8c12f5b04e Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 5 May 2017 17:12:41 +0100 Subject: [PATCH 018/170] QedFVol: Access HVP tensor using a vector> instead of vector> --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 64 +++++---------------- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 1 - 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 7745c40e..880b3ffd 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -205,15 +205,15 @@ void TScalarVP::execute(void) } // CONTRACTIONS - vpTensor_.clear(); + std::vector > vpTensor; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - std::vector vpTensor_mu; + std::vector vpTensor_mu; for (unsigned int nu = 0; nu < env().getNd(); ++nu) { - vpTensor_mu.push_back(env().createLattice(vpTensorName_[mu][nu])); + 
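// The zero-momentum projection used by the output step above, isolated as a
// standalone pattern (pi_mu_nu stands for any of the saved fields; the
// container types follow the module's usage):
//
//   std::vector<TComplex> vecBuf;
//   std::vector<Complex>  result;
//   sliceSum(pi_mu_nu, vecBuf, Tp);           // sum over each timeslice
//   result.resize(vecBuf.size());
//   for (unsigned int t = 0; t < vecBuf.size(); ++t)
//   {
//       result[t] = TensorRemove(vecBuf[t]);  // strip the tensor wrapper
//   }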
vpTensor_mu.push_back(*env().createLattice(vpTensorName_[mu][nu])); } - vpTensor_.push_back(vpTensor_mu); + vpTensor.push_back(vpTensor_mu); } ScalarField prop1(env().getGrid()), prop2(env().getGrid()); EmField &A = *env().getObject(par().emField); @@ -221,7 +221,6 @@ void TScalarVP::execute(void) TComplex Anu0; std::vector coor0 = {0, 0, 0, 0}; - // Position-space implementation prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; fft.FFT_all_dim(prop1, prop1, FFT::backward); for (unsigned int nu = 0; nu < env().getNd(); ++nu) @@ -231,57 +230,24 @@ void TScalarVP::execute(void) + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); fft.FFT_all_dim(prop2, prop2, FFT::backward); + std::vector pi_nu; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." << std::endl; Amu = peekLorentz(A, mu); - ScalarField &pi_mu_nu = *(vpTensor_[mu][nu]); - pi_mu_nu = adj(prop2) - * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) - * Cshift(prop1, mu, 1) - * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - pi_mu_nu -= Cshift(adj(prop2), mu, 1) - * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) - * prop1 - * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - pi_mu_nu = 2.0*real(pi_mu_nu); + vpTensor[mu][nu] = adj(prop2) + * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) + * Cshift(prop1, mu, 1) + * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) + * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) + * prop1 + * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); } } - // // Momentum-space implementation - // ScalarField propbuf1(env().getGrid()), propbuf2(env().getGrid()); - // prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; - // for (unsigned int nu = 0; nu < env().getNd(); ++nu) - // { - // peekSite(Anu0, peekLorentz(A, nu), coor0); - // prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) - // + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); - - // for (unsigned int mu = 0; mu < env().getNd(); ++mu) - // { - // LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." 
- // << std::endl; - // Amu = peekLorentz(A, mu); - // ScalarField &pi_mu_nu = *(vpTensor_[mu][nu]); - // propbuf1 = (*phase_[mu])*prop1; - // fft.FFT_all_dim(propbuf1, propbuf1, FFT::backward); - // fft.FFT_all_dim(propbuf2, prop2, FFT::backward); - // pi_mu_nu = adj(propbuf2) - // * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) - // * propbuf1 - // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - // propbuf2 = (*phase_[mu])*prop2; - // fft.FFT_all_dim(propbuf1, prop1, FFT::backward); - // fft.FFT_all_dim(propbuf2, propbuf2, FFT::backward); - // pi_mu_nu -= adj(propbuf2) - // * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) - // * propbuf1 - // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - // pi_mu_nu = 2.0*real(pi_mu_nu); - // } - // } - // OUTPUT IF NECESSARY if (!par().output.empty()) { @@ -302,7 +268,7 @@ void TScalarVP::execute(void) { for (unsigned int nu = 0; nu < env().getNd(); ++nu) { - sliceSum(*(vpTensor_[mu][nu]), vecBuf, Tp); + sliceSum(vpTensor[mu][nu], vecBuf, Tp); result.resize(vecBuf.size()); for (unsigned int t = 0; t < vecBuf.size(); ++t) { diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index fbe73d85..9d884575 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -57,7 +57,6 @@ private: ScalarField *freeMomProp_, *GFSrc_, *prop0_; std::vector phase_; - std::vector > vpTensor_; EmField *A; }; From 741bc836f69d37623cba76cf4aee06dee3f6c84e Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 5 May 2017 17:36:43 +0100 Subject: [PATCH 019/170] Exposing support for Ncolours and Ndimensions and JSON input file for the ScalarAction --- lib/qcd/action/scalar/Scalar.h | 6 +- lib/qcd/action/scalar/ScalarImpl.h | 5 +- .../action/scalar/ScalarInteractionAction.h | 152 +++++++++--------- lib/qcd/hmc/GenericHMCrunner.h | 3 + lib/qcd/representations/hmc_types.h | 3 + lib/stencil/Stencil.h | 2 +- tests/hmc/Test_hmc_ScalarActionNxN.cc | 104 ++++++++---- 7 files changed, 168 insertions(+), 107 deletions(-) diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h index cae38360..485a6765 100644 --- a/lib/qcd/action/scalar/Scalar.h +++ b/lib/qcd/action/scalar/Scalar.h @@ -40,9 +40,9 @@ namespace QCD { typedef ScalarAction ScalarActionF; typedef ScalarAction ScalarActionD; - typedef ScalarInteractionAction ScalarAdjActionR; - typedef ScalarInteractionAction ScalarAdjActionF; - typedef ScalarInteractionAction ScalarAdjActionD; + template using ScalarAdjActionR = ScalarInteractionAction, Dimensions>; + template using ScalarAdjActionF = ScalarInteractionAction, Dimensions>; + template using ScalarAdjActionD = ScalarInteractionAction, Dimensions>; } } diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 6d14b61a..8b5e3aa2 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -96,7 +96,10 @@ class ScalarImplTypes { typedef ScalarAdjMatrixImplTypes ScalarAdjImplF; typedef ScalarAdjMatrixImplTypes ScalarAdjImplD; - + template using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes; + template using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes; + template using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes; + //} } diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 5a322a5e..ca8207bd 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -37,11 +37,11 @@ directory namespace Grid { // FIXME drop the QCD 
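// Several template argument lists in the hunks above lost their
// angle-bracket contents in transit. From the surrounding definitions they
// plausibly read along the following lines (a reconstruction, not verbatim
// patch text):
//
//   template <int Colours>
//   using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours>;
//
//   template <int Colours, int Dimensions>
//   using ScalarAdjActionR =
//       ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
//
// the point of the commit being that colour rank and dimensionality become
// compile-time parameters instead of hardcoded values.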
namespace everywhere here -template -class ScalarInteractionAction : public QCD::Action { -public: + template + class ScalarInteractionAction : public QCD::Action { + public: INHERIT_FIELD_TYPES(Impl); -private: + private: RealD mass_square; RealD lambda; @@ -50,14 +50,19 @@ private: typedef CartesianStencil Stencil; SimpleCompressor compressor; - int npoint = 8; - std::vector directions = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions - std::vector displacements = {1,1,1,1, -1,-1,-1,-1}; + int npoint = 2*Ndim; + std::vector directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions + std::vector displacements;// = {1,1,1,1, -1,-1,-1,-1}; - public: + public: - ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l){} + ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){ + for (int mu = 0 ; mu < Ndim; mu++){ + directions[mu] = mu; directions[mu+Ndim] = mu; + displacements[mu] = 1; displacements[mu+Ndim] = -1; + } + } virtual std::string LogParameters() { std::stringstream sstream; @@ -71,75 +76,74 @@ private: virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} virtual RealD S(const Field &p) { - static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); - phiStencil.HaloExchange(p, compressor); - - Field action(p._grid), pshift(p._grid), phisquared(p._grid); - phisquared = p*p; - action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared; - for (int mu = 0; mu < QCD::Nd; mu++) { - // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils - PARALLEL_FOR_LOOP - for (int i = 0; i < p._grid->oSites(); i++) { - int permute_type; - StencilEntry *SE; - vobj temp2; - vobj *temp; - vobj *t_p; - - SE = phiStencil.GetEntry(permute_type, mu, i); - t_p = &p._odata[i]; - if ( SE->_is_local ) { - temp = &p._odata[SE->_offset]; - if ( SE->_permute ) { - permute(temp2, *temp, permute_type); - action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2; - } else { - action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp); - } - } else { - action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; - } - } - // action -= pshift*p + p*pshift; - } - // NB the trace in the algebra is normalised to 1/2 - // minus sign coming from the antihermitian fields - return -(TensorRemove(sum(trace(action)))).real(); + assert(p._grid->Nd() == Ndim); + static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); + phiStencil.HaloExchange(p, compressor); + Field action(p._grid), pshift(p._grid), phisquared(p._grid); + phisquared = p*p; + action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared; + for (int mu = 0; mu < Ndim; mu++) { + // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils + parallel_for (int i = 0; i < p._grid->oSites(); i++) { + int permute_type; + StencilEntry *SE; + vobj temp2; + vobj *temp; + vobj *t_p; + + SE = phiStencil.GetEntry(permute_type, mu, i); + t_p = &p._odata[i]; + if ( SE->_is_local ) { + temp = &p._odata[SE->_offset]; + if ( SE->_permute ) { + permute(temp2, *temp, permute_type); + action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2; + } else { + action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp); + } + } else { + action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; + } + } + // action -= pshift*p + p*pshift; + } + // NB the trace in the algebra is normalised to 1/2 + // minus sign coming from the antihermitian fields + return 
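// For orientation, the action assembled in S() above is, with the algebra
// trace normalised to 1/2 and the minus sign from the antihermitian fields,
//
//   S = - sum_x tr[ (2*Ndim + m^2) phi(x)^2 + lambda phi(x)^4
//                   - sum_mu ( phi(x) phi(x+mu) + phi(x+mu) phi(x) ) ],
//
// the nearest-neighbour term being evaluated through the halo-exchanged
// stencil rather than a chain of Cshift temporaries.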
-(TensorRemove(sum(trace(action)))).real(); }; virtual void deriv(const Field &p, Field &force) { - force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p; - // move this outside - static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); - phiStencil.HaloExchange(p, compressor); - - //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); - for (int point = 0; point < npoint; point++) { - PARALLEL_FOR_LOOP - for (int i = 0; i < p._grid->oSites(); i++) { - vobj *temp; - vobj temp2; - int permute_type; - StencilEntry *SE; - SE = phiStencil.GetEntry(permute_type, point, i); - - if ( SE->_is_local ) { - temp = &p._odata[SE->_offset]; - if ( SE->_permute ) { - permute(temp2, *temp, permute_type); - force._odata[i] -= temp2; - } else { - force._odata[i] -= *temp; - } - } else { - force._odata[i] -= phiStencil.CommBuf()[SE->_offset]; - } - } - } + assert(p._grid->Nd() == Ndim); + force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p; + // move this outside + static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); + phiStencil.HaloExchange(p, compressor); + + //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); + for (int point = 0; point < npoint; point++) { + parallel_for (int i = 0; i < p._grid->oSites(); i++) { + vobj *temp; + vobj temp2; + int permute_type; + StencilEntry *SE; + SE = phiStencil.GetEntry(permute_type, point, i); + + if ( SE->_is_local ) { + temp = &p._odata[SE->_offset]; + if ( SE->_permute ) { + permute(temp2, *temp, permute_type); + force._odata[i] -= temp2; + } else { + force._odata[i] -= *temp; + } + } else { + force._odata[i] -= phiStencil.CommBuf()[SE->_offset]; + } + } + } } -}; - + }; + } // namespace Grid #endif // SCALAR_INT_ACTION_H diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/lib/qcd/hmc/GenericHMCrunner.h index 353b4905..4f6c1af0 100644 --- a/lib/qcd/hmc/GenericHMCrunner.h +++ b/lib/qcd/hmc/GenericHMCrunner.h @@ -210,6 +210,9 @@ typedef HMCWrapperTemplate typedef HMCWrapperTemplate ScalarAdjGenericHMCRunner; +template +using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR, MinimumNorm2, ScalarNxNMatrixFields >; + } // namespace QCD } // namespace Grid diff --git a/lib/qcd/representations/hmc_types.h b/lib/qcd/representations/hmc_types.h index b4991941..3fee377e 100644 --- a/lib/qcd/representations/hmc_types.h +++ b/lib/qcd/representations/hmc_types.h @@ -64,6 +64,9 @@ typedef Representations NoHirep; typedef Representations > ScalarFields; typedef Representations > ScalarMatrixFields; +template < int Colours> +using ScalarNxNMatrixFields = Representations::Field> >; + // Helper classes to access the elements // Strips the first N parameters from the tuple // sequence of classes to obtain the S sequence diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index d1c28e78..887142c4 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -286,7 +286,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal { int dimension = _directions[point]; int displacement = _distances[point]; - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index f63936b5..b3ce6840 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -32,68 +32,116 @@ class ScalarActionParameters : Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters, double, 
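// A stencil-free reference for the deriv() above, useful as a cross-check
// of the halo-exchange version; this sketch allocates the temporaries the
// stencil code deliberately avoids:
//
//   template <class Field>
//   void derivReference(const Field &p, Field &force,
//                       RealD mass_square, RealD lambda, int Ndim)
//   {
//       force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
//       for (int mu = 0; mu < Ndim; mu++)
//           force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
//   }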
mass_squared, double, lambda); + + template + ScalarActionParameters(Reader& Reader){ + read(Reader, "ScalarAction", *this); + } + }; } int main(int argc, char **argv) { using namespace Grid; using namespace Grid::QCD; - + typedef Grid::JSONReader Serialiser; + Grid_init(&argc, &argv); int threads = GridThread::GetThreads(); // here make a routine to print all the relevant information on the run std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; // Typedefs to simplify notation - typedef ScalarAdjGenericHMCRunner HMCWrapper; // Uses the default minimum norm, real scalar fields - + constexpr int Ncolours = 4; + constexpr int Ndimensions = 3; + typedef ScalarNxNAdjGenericHMCRunner HMCWrapper; // Uses the default minimum norm, real scalar fields + typedef ScalarAdjActionR ScalarAction; //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: HMCWrapper TheHMC; + TheHMC.ReadCommandLine(argc, argv); + + if (TheHMC.ParameterFile.empty()){ + std::cout << "Input file not specified." + << "Use --ParameterFile option in the command line.\nAborting" + << std::endl; + exit(1); + } + Serialiser Reader(TheHMC.ParameterFile); // Grid from the command line GridModule ScalarGrid; - ScalarGrid.set_full(SpaceTimeGrid::makeFourDimGrid( - GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), - GridDefaultMpi())); - ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full())); + if (GridDefaultLatt().size() != Ndimensions){ + std::cout << "Incorrect dimension of the grid\n. Expected dim="<< Ndimensions << std::endl; + exit(1); + } + if (GridDefaultMpi().size() != Ndimensions){ + std::cout << "Incorrect dimension of the mpi grid\n. Expected dim="<< Ndimensions << std::endl; + exit(1); + } + ScalarGrid.set_full(new GridCartesian(GridDefaultLatt(),GridDefaultSimd(Ndimensions, vComplex::Nsimd()),GridDefaultMpi())); + ScalarGrid.set_rb(new GridRedBlackCartesian(ScalarGrid.get_full())); TheHMC.Resources.AddGrid("scalar", ScalarGrid); - // Possibile to create the module by hand - // hardcoding parameters or using a Reader + std::cout << "Lattice size : " << GridDefaultLatt() << std::endl; // Checkpointer definition - CheckpointerParameters CPparams; - CPparams.config_prefix = "ckpoint_scalar_lat"; - CPparams.rng_prefix = "ckpoint_scalar_rng"; - CPparams.saveInterval = 50; - CPparams.format = "IEEE64BIG"; - + CheckpointerParameters CPparams(Reader); TheHMC.Resources.LoadBinaryCheckpointer(CPparams); - RNGModuleParameters RNGpar; - RNGpar.serial_seeds = "1 2 3 4 5"; - RNGpar.parallel_seeds = "6 7 8 9 10"; + RNGModuleParameters RNGpar(Reader); TheHMC.Resources.SetRNGSeeds(RNGpar); ///////////////////////////////////////////////////////////// // Collect actions, here use more encapsulation // Scalar action in adjoint representation - ScalarActionParameters SPar; - SPar.mass_squared = 0.5; - SPar.lambda = 0.1; - ScalarAdjActionR Saction(SPar.mass_squared, SPar.lambda); + ScalarActionParameters SPar(Reader); + ScalarAction Saction(SPar.mass_squared, SPar.lambda); // Collect actions - ActionLevel Level1(1); + ActionLevel> Level1(1); Level1.push_back(&Saction); TheHMC.TheAction.push_back(Level1); ///////////////////////////////////////////////////////////// + TheHMC.Parameters.initialize(Reader); - // HMC parameters are serialisable - TheHMC.Parameters.MD.MDsteps = 20; - TheHMC.Parameters.MD.trajL = 1.0; - - TheHMC.ReadCommandLine(argc, argv); TheHMC.Run(); Grid_finalize(); } // main + +/* Examples for input files + +JSON + +{ + "Checkpointer": 
{ + "config_prefix": "ckpoint_scalar_lat", + "rng_prefix": "ckpoint_scalar_rng", + "saveInterval": 1, + "format": "IEEE64BIG" + }, + "RandomNumberGenerator": { + "serial_seeds": "1 2 3 4 6", + "parallel_seeds": "6 7 8 9 11" + }, + "ScalarAction":{ + "mass_squared": 0.5, + "lambda": 0.1 + }, + "HMC":{ + "StartTrajectory": 0, + "Trajectories": 100, + "MetropolisTest": true, + "NoMetropolisUntil": 10, + "StartingType": "HotStart", + "MD":{ + "name": "MinimumNorm2", + "MDsteps": 15, + "trajL": 2.0 + } + } +} + + +XML example not provided yet + +*/ \ No newline at end of file From 914f180fa31ca19b0710d393221f56785141b67c Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 9 May 2017 11:46:25 +0100 Subject: [PATCH 020/170] QedFVol: Implement exact O(alpha) vacuum polarisation. --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 119 ++++++++++++++------ extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 3 +- 2 files changed, 85 insertions(+), 37 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 880b3ffd..6e9be923 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -30,8 +30,6 @@ std::vector TScalarVP::getOutput(void) for (unsigned int mu = 0; mu < env().getNd(); ++mu) { out.push_back(getName() + "_propQ_" + std::to_string(mu)); - out.push_back(getName() + "_propSun_" + std::to_string(mu)); - out.push_back(getName() + "_propTad_" + std::to_string(mu)); for (unsigned int nu = 0; nu < env().getNd(); ++nu) { @@ -54,16 +52,12 @@ void TScalarVP::setup(void) phaseName_.clear(); muPropQName_.clear(); - muPropSunName_.clear(); - muPropTadName_.clear(); vpTensorName_.clear(); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { phaseName_.push_back("_shiftphase_" + std::to_string(mu)); muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); - muPropSunName_.push_back(getName() + "_propSun_" + std::to_string(mu)); - muPropTadName_.push_back(getName() + "_propTad_" + std::to_string(mu)); std::vector vpTensorName_mu; for (unsigned int nu = 0; nu < env().getNd(); ++nu) @@ -99,16 +93,8 @@ void TScalarVP::setup(void) env().registerLattice(muPropQName_[mu]); } env().registerLattice(propSunName_); - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - env().registerLattice(muPropSunName_[mu]); - } env().registerLattice(propTadName_); for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - env().registerLattice(muPropTadName_[mu]); - } - for (unsigned int mu = 0; mu < env().getNd(); ++mu) { for (unsigned int nu = 0; nu < env().getNd(); ++nu) { @@ -191,17 +177,16 @@ void TScalarVP::execute(void) chargedProp(propQ, propSun, propTad, *GFSrc_, fft); // Propagators from shifted sources - std::vector muPropQ_, muPropSun_, muPropTad_; - ScalarField buf(env().getGrid()); + std::vector muPropQ; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - muPropQ_.push_back(env().createLattice(muPropQName_[mu])); - muPropSun_.push_back(env().createLattice(muPropSunName_[mu])); - muPropTad_.push_back(env().createLattice(muPropTadName_[mu])); + muPropQ.push_back(*env().createLattice(muPropQName_[mu])); - buf = adj(*phase_[mu])*(*GFSrc_); - chargedProp(*(muPropQ_[mu]), *(muPropSun_[mu]), *(muPropTad_[mu]), - buf, fft); + // -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv) + muPropQ[mu] = adj(*phase_[mu])*(*GFSrc_); + momD1(muPropQ[mu], fft); + muPropQ[mu] = -(*freeMomProp_)*muPropQ[mu]; + fft.FFT_all_dim(muPropQ[mu], muPropQ[mu], FFT::backward); } // CONTRACTIONS @@ -221,33 +206,94 @@ void 
TScalarVP::execute(void) TComplex Anu0; std::vector coor0 = {0, 0, 0, 0}; - prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; - fft.FFT_all_dim(prop1, prop1, FFT::backward); + // Free VP + + // Charged VP for (unsigned int nu = 0; nu < env().getNd(); ++nu) { peekSite(Anu0, peekLorentz(A, nu), coor0); - prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) - + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); - fft.FFT_all_dim(prop2, prop2, FFT::backward); - std::vector pi_nu; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." << std::endl; Amu = peekLorentz(A, mu); - vpTensor[mu][nu] = adj(prop2) - * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) - * Cshift(prop1, mu, 1) - * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + + // "Exchange" terms + prop1 = *prop0_ + q*propQ; + prop2 = Cshift(*prop0_, nu, -1) + q*muPropQ[nu]; + vpTensor[mu][nu] = adj(prop2) * (1.0 + ci*q*Amu) + * Cshift(prop1, mu, 1) * (1.0 + ci*q*Anu0); + vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu) + * prop1 * (1.0 + ci*q*Anu0); + + // Subtract O(alpha^2) term + prop1 = q*propQ; + prop2 = q*muPropQ[nu]; + vpTensor[mu][nu] -= adj(prop2) * ci*q*Amu + * Cshift(prop1, mu, 1) * ci*q*Anu0; + vpTensor[mu][nu] += Cshift(adj(prop2), mu, 1) * (-ci)*q*Amu + * prop1 * ci*q*Anu0; + + // Sunset+tadpole from source + prop1 = q*q*(propSun + propTad); + prop2 = Cshift(*prop0_, nu, -1); + vpTensor[mu][nu] += adj(prop2) * Cshift(prop1, mu, 1); + vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * prop1; + + // Sunset+tadpole from shifted source + prop1 = Cshift(prop1, nu, -1); + vpTensor[mu][nu] += Cshift(adj(*prop0_), mu, 1) * prop1; + vpTensor[mu][nu] -= adj(*prop0_) * Cshift(prop1, mu, 1); + + // Source tadpole + prop1 = *prop0_; + vpTensor[mu][nu] += adj(prop2) + * Cshift(prop1, mu, 1) + * (-0.5)*q*q*Anu0*Anu0; vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) - * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) * prop1 - * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + * (-0.5)*q*q*Anu0*Anu0; + + // Sink tadpole + vpTensor[mu][nu] += adj(prop2) + * (-0.5)*q*q*Amu*Amu + * Cshift(prop1, mu, 1); + vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) + * (-0.5)*q*q*Amu*Amu + * prop1; + vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); } } + // prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; + // fft.FFT_all_dim(prop1, prop1, FFT::backward); + // for (unsigned int nu = 0; nu < env().getNd(); ++nu) + // { + // peekSite(Anu0, peekLorentz(A, nu), coor0); + // prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) + // + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); + // fft.FFT_all_dim(prop2, prop2, FFT::backward); + + // std::vector pi_nu; + // for (unsigned int mu = 0; mu < env().getNd(); ++mu) + // { + // LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." 
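// Bookkeeping for the "exact O(alpha)" assembly above: the tensor collects
// (i) the exchange piece, in which each propagator carries its O(q)
// correction and each vertex its O(q) photon factor, with the O(q^4), i.e.
// O(alpha^2), combination that this product generates subtracted back out;
// (ii) the q^2 sunset and tadpole propagator corrections inserted on the
// line from the source and from the shifted source; and (iii) the
// -0.5*q^2*A^2 seagull factors at the source and sink vertices. Everything
// beyond O(q^2) = O(alpha) is dropped.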
+ // << std::endl; + // Amu = peekLorentz(A, mu); + // vpTensor[mu][nu] = adj(prop2) + // * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) + // * Cshift(prop1, mu, 1) + // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + // vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) + // * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) + // * prop1 + // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); + // vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); + // } + // } + // OUTPUT IF NECESSARY if (!par().output.empty()) { @@ -281,7 +327,7 @@ void TScalarVP::execute(void) } } -// Calculate O(q) and O(q^2) terms of momentum-space charged propagator +// Calculate O(q) and O(q^2) terms of position-space charged propagator void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, ScalarField &prop_tad, ScalarField &GFSrc, FFT &fft) @@ -300,15 +346,18 @@ void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, momD1(buf, fft); buf = G*buf; prop_q = -buf; + fft.FFT_all_dim(prop_q, prop_q, FFT::backward); // G*momD1*G*momD1*G*F*Src momD1(buf, fft); prop_sun = G*buf; + fft.FFT_all_dim(prop_sun, prop_sun, FFT::backward); // -G*momD2*G*F*Src (momD2 = F*D2*Finv) buf = GFSrc; momD2(buf, fft); prop_tad = -G*buf; + fft.FFT_all_dim(prop_tad, prop_tad, FFT::backward); } void TScalarVP::momD1(ScalarField &s, FFT &fft) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 9d884575..e0bdd034 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -51,8 +51,7 @@ private: std::string freeMomPropName_, GFSrcName_, prop0Name_, propQName_, propSunName_, propTadName_; - std::vector phaseName_, muPropQName_, - muPropSunName_, muPropTadName_; + std::vector phaseName_, muPropQName_; std::vector > vpTensorName_; ScalarField *freeMomProp_, *GFSrc_, *prop0_; From 5cfc0180aaab83f80c0d8b92d25bbbe97cce57f7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Tue, 9 May 2017 12:46:57 +0100 Subject: [PATCH 021/170] QedFVol: Output free VP along with charged VP. 
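(The free tensor saved alongside the charged one below is the same
conserved-current contraction evaluated with free propagators alone, i.e.
the q = 0 limit of Pi_mu_nu; presumably this lets downstream analysis form
the charged-minus-free difference without rerunning the module.)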
--- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 96 ++++++++++----------- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 3 +- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 6e9be923..f6f40700 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -23,7 +23,7 @@ std::vector TScalarVP::getInput(void) std::vector TScalarVP::getOutput(void) { - std::vector out = {getName(), getName()+"_propQ", + std::vector out = {getName()+"_propQ", getName()+"_propSun", getName()+"_propTad"}; @@ -34,6 +34,7 @@ std::vector TScalarVP::getOutput(void) for (unsigned int nu = 0; nu < env().getNd(); ++nu) { out.push_back(getName() + "_" + std::to_string(mu) + "_" + std::to_string(nu)); + out.push_back(getName() + "_free_" + std::to_string(mu) + "_" + std::to_string(nu)); } } @@ -53,6 +54,7 @@ void TScalarVP::setup(void) phaseName_.clear(); muPropQName_.clear(); vpTensorName_.clear(); + freeVpTensorName_.clear(); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { @@ -60,12 +62,16 @@ void TScalarVP::setup(void) muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu)); std::vector vpTensorName_mu; + std::vector freeVpTensorName_mu; for (unsigned int nu = 0; nu < env().getNd(); ++nu) { vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu) + "_" + std::to_string(nu)); + freeVpTensorName_mu.push_back(getName() + "_free_" + std::to_string(mu) + + "_" + std::to_string(nu)); } vpTensorName_.push_back(vpTensorName_mu); + freeVpTensorName_.push_back(freeVpTensorName_mu); } if (!env().hasRegisteredObject(freeMomPropName_)) @@ -90,7 +96,7 @@ void TScalarVP::setup(void) env().registerLattice(propQName_); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - env().registerLattice(muPropQName_[mu]); + env().registerLattice(muPropQName_[mu]); } env().registerLattice(propSunName_); env().registerLattice(propTadName_); @@ -99,9 +105,9 @@ void TScalarVP::setup(void) for (unsigned int nu = 0; nu < env().getNd(); ++nu) { env().registerLattice(vpTensorName_[mu][nu]); + env().registerLattice(freeVpTensorName_[mu][nu]); } } - env().registerLattice(getName()); } // execution /////////////////////////////////////////////////////////////////// @@ -171,12 +177,18 @@ void TScalarVP::execute(void) // PROPAGATOR CALCULATION // Propagator from unshifted source + LOG(Message) << "Computing O(alpha) charged scalar propagator" + << " (mass= " << par().mass + << ", charge= " << q << ")..." + << std::endl; ScalarField &propQ = *env().createLattice(propQName_); ScalarField &propSun = *env().createLattice(propSunName_); ScalarField &propTad = *env().createLattice(propTadName_); chargedProp(propQ, propSun, propTad, *GFSrc_, fft); // Propagators from shifted sources + LOG(Message) << "Computing O(q) charged scalar propagators..." 
+ << std::endl; std::vector muPropQ; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { @@ -190,25 +202,25 @@ void TScalarVP::execute(void) } // CONTRACTIONS - std::vector > vpTensor; - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - std::vector vpTensor_mu; - for (unsigned int nu = 0; nu < env().getNd(); ++nu) - { - vpTensor_mu.push_back(*env().createLattice(vpTensorName_[mu][nu])); - } - vpTensor.push_back(vpTensor_mu); - } ScalarField prop1(env().getGrid()), prop2(env().getGrid()); EmField &A = *env().getObject(par().emField); ScalarField Amu(env().getGrid()); TComplex Anu0; std::vector coor0 = {0, 0, 0, 0}; + std::vector > vpTensor, freeVpTensor; + for (unsigned int mu = 0; mu < env().getNd(); ++mu) + { + std::vector vpTensor_mu; + std::vector freeVpTensor_mu; + for (unsigned int nu = 0; nu < env().getNd(); ++nu) + { + vpTensor_mu.push_back(*env().createLattice(vpTensorName_[mu][nu])); + freeVpTensor_mu.push_back(*env().createLattice(freeVpTensorName_[mu][nu])); + } + vpTensor.push_back(vpTensor_mu); + freeVpTensor.push_back(freeVpTensor_mu); + } - // Free VP - - // Charged VP for (unsigned int nu = 0; nu < env().getNd(); ++nu) { peekSite(Anu0, peekLorentz(A, nu), coor0); @@ -219,9 +231,15 @@ void TScalarVP::execute(void) << std::endl; Amu = peekLorentz(A, mu); + // Free VP + prop1 = *prop0_; + prop2 = Cshift(*prop0_, nu, -1); + freeVpTensor[mu][nu] = adj(prop2) * Cshift(prop1, mu, 1); + freeVpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * prop1; + // "Exchange" terms - prop1 = *prop0_ + q*propQ; - prop2 = Cshift(*prop0_, nu, -1) + q*muPropQ[nu]; + prop1 += q*propQ; + prop2 += q*muPropQ[nu]; vpTensor[mu][nu] = adj(prop2) * (1.0 + ci*q*Amu) * Cshift(prop1, mu, 1) * (1.0 + ci*q*Anu0); vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu) @@ -267,33 +285,6 @@ void TScalarVP::execute(void) } } - // prop1 = *GFSrc_ + q*propQ + q*q*propSun + q*q*propTad; - // fft.FFT_all_dim(prop1, prop1, FFT::backward); - // for (unsigned int nu = 0; nu < env().getNd(); ++nu) - // { - // peekSite(Anu0, peekLorentz(A, nu), coor0); - // prop2 = adj(*phase_[nu])*(*GFSrc_) + q*(*(muPropQ_[nu])) - // + q*q*(*(muPropSun_[nu]) + *(muPropTad_[nu])); - // fft.FFT_all_dim(prop2, prop2, FFT::backward); - - // std::vector pi_nu; - // for (unsigned int mu = 0; mu < env().getNd(); ++mu) - // { - // LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..." 
- // << std::endl; - // Amu = peekLorentz(A, mu); - // vpTensor[mu][nu] = adj(prop2) - // * (1.0 + ci*q*Amu - 0.5*q*q*Amu*Amu) - // * Cshift(prop1, mu, 1) - // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - // vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) - // * (1.0 - ci*q*Amu - 0.5*q*q*Amu*Amu) - // * prop1 - // * (1.0 + ci*q*Anu0 - 0.5*q*q*Anu0*Anu0); - // vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); - // } - // } - // OUTPUT IF NECESSARY if (!par().output.empty()) { @@ -322,6 +313,16 @@ void TScalarVP::execute(void) } write(writer, "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), result); + + sliceSum(freeVpTensor[mu][nu], vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, + "Pi_"+std::to_string(mu)+"_"+std::to_string(nu)+"_free", + result); } } } @@ -335,11 +336,6 @@ void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, Complex ci(0.0,1.0); ScalarField &G = *freeMomProp_; ScalarField buf(env().getGrid()); - - LOG(Message) << "Computing charged scalar propagator" - << " (mass= " << par().mass - << ", charge= " << par().charge << ")..." - << std::endl; // -G*momD1*G*F*Src (momD1 = F*D1*Finv) buf = GFSrc; diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index e0bdd034..4629f6e6 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -52,7 +52,8 @@ private: prop0Name_, propQName_, propSunName_, propTadName_; std::vector phaseName_, muPropQName_; - std::vector > vpTensorName_; + std::vector > vpTensorName_, + freeVpTensorName_; ScalarField *freeMomProp_, *GFSrc_, *prop0_; std::vector phase_; From 43c817cc67c6447bbf69bfc7d7772fba4e7ff9eb Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 11 May 2017 00:07:17 +0100 Subject: [PATCH 022/170] Scalar action: const fix --- lib/qcd/action/scalar/ScalarInteractionAction.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index ca8207bd..5f4c630c 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -88,8 +88,7 @@ namespace Grid { int permute_type; StencilEntry *SE; vobj temp2; - vobj *temp; - vobj *t_p; + const vobj *temp, *t_p; SE = phiStencil.GetEntry(permute_type, mu, i); t_p = &p._odata[i]; @@ -122,7 +121,7 @@ namespace Grid { //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); for (int point = 0; point < npoint; point++) { parallel_for (int i = 0; i < p._grid->oSites(); i++) { - vobj *temp; + const vobj *temp; vobj temp2; int permute_type; StencilEntry *SE; From d1ece741370d1b829f5946afc7c21c585a158d31 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 11 May 2017 11:40:44 +0100 Subject: [PATCH 023/170] HMC scalar test: magnetisation measurement --- tests/hmc/Test_hmc_ScalarActionNxN.cc | 54 ++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index b3ce6840..bcaee31d 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -39,11 +39,50 @@ class ScalarActionParameters : Serializable { } }; - } + +using namespace Grid; +using namespace Grid::QCD; + +template +class MagLogger : public HmcObservable { +public: + typedef typename 
Impl::Field Field; + typedef typename Impl::Simd::scalar_type Trace; + + void TrajectoryComplete(int traj, + Field &U, + GridSerialRNG &sRNG, + GridParallelRNG &pRNG) { + + int def_prec = std::cout.precision(); + + std::cout << std::setprecision(std::numeric_limits::digits10 + 1); + std::cout << GridLogMessage + << "m= " << TensorRemove(trace(sum(U))) << std::endl; + std::cout << GridLogMessage + << "m^2= " << TensorRemove(trace(sum(U)*sum(U))) << std::endl; + std::cout.precision(def_prec); + + } +private: + +}; + +template +class MagMod: public ObservableModule, NoParameters>{ + typedef ObservableModule, NoParameters> ObsBase; + using ObsBase::ObsBase; // for constructors + + // acquire resource + virtual void initialize(){ + this->ObservablePtr.reset(new MagLogger()); + } +public: + MagMod(): ObsBase(NoParameters()){} +}; + int main(int argc, char **argv) { - using namespace Grid; - using namespace Grid::QCD; typedef Grid::JSONReader Serialiser; Grid_init(&argc, &argv); @@ -52,7 +91,7 @@ int main(int argc, char **argv) { std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; // Typedefs to simplify notation - constexpr int Ncolours = 4; + constexpr int Ncolours = 2; constexpr int Ndimensions = 3; typedef ScalarNxNAdjGenericHMCRunner HMCWrapper; // Uses the default minimum norm, real scalar fields typedef ScalarAdjActionR ScalarAction; @@ -89,6 +128,11 @@ int main(int argc, char **argv) { RNGModuleParameters RNGpar(Reader); TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + typedef MagMod MagObs; + TheHMC.Resources.AddObservable(); + ///////////////////////////////////////////////////////////// // Collect actions, here use more encapsulation @@ -144,4 +188,4 @@ JSON XML example not provided yet -*/ \ No newline at end of file +*/ From 3f858d675557536feb6bac6312e4205c987857d9 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 17 May 2017 13:25:14 +0200 Subject: [PATCH 024/170] Scalar: phi^2 observable --- tests/hmc/Test_hmc_ScalarActionNxN.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index bcaee31d..a7490f51 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -62,6 +62,8 @@ public: << "m= " << TensorRemove(trace(sum(U))) << std::endl; std::cout << GridLogMessage << "m^2= " << TensorRemove(trace(sum(U)*sum(U))) << std::endl; + std::cout << GridLogMessage + << "phi^2= " << TensorRemove(sum(trace(U*U))) << std::endl; std::cout.precision(def_prec); } From a8c10b1933948d491371da0d4df32cb3059c3b97 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 25 May 2017 11:43:33 +0100 Subject: [PATCH 025/170] Use a global-X x Local-Y chunksize for parallel binary I/O. Gives O(32 x 8 x 18*8*8) chunk size on configuration I/O. At 150KB should be getting close to packet sizes and 4MB filesystem block sizes that are reasonably (!?) performant. We shall see once I move this off my laptop and over to BNL and time it. 
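For reference, the MagLogger/MagMod pair introduced a few patches above is
the general recipe for hooking a custom measurement into the HMC; a minimal
skeleton, with template arguments restored where this excerpt dropped them
and all names illustrative, would read:

template <class Impl>
class MyObs : public HmcObservable<typename Impl::Field> {
public:
    typedef typename Impl::Field Field;
    void TrajectoryComplete(int traj, Field &U,
                            GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
        // measure on the end-of-trajectory field U here
    }
};

template <class Impl>
class MyObsMod : public ObservableModule<MyObs<Impl>, NoParameters> {
    typedef ObservableModule<MyObs<Impl>, NoParameters> ObsBase;
    using ObsBase::ObsBase;
    virtual void initialize() { this->ObservablePtr.reset(new MyObs<Impl>()); }
public:
    MyObsMod() : ObsBase(NoParameters()) {}
};

registered with the resource manager via
TheHMC.Resources.AddObservable<MyObsMod<Impl>>(), following the MagObs usage
in the test above.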
--- lib/parallelIO/BinaryIO.h | 196 +++++++++++++++++++++----------------- lib/parallelIO/NerscIO.h | 6 +- 2 files changed, 113 insertions(+), 89 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index afa7eb2e..ab449f92 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -217,32 +217,34 @@ class BinaryIO { Umu = zero; uint32_t csum=0; uint64_t bytes=0; - fobj file_object; - sobj munged; - + + int lx = grid->_fdimensions[0]; + std::vector file_object(lx); + std::vector munged(lx); for(int t=0;t_fdimensions[3];t++){ for(int z=0;z_fdimensions[2];z++){ for(int y=0;y_fdimensions[1];y++){ - for(int x=0;x_fdimensions[0];x++){ - - std::vector site({x,y,z,t}); - + { + bytes += sizeof(fobj)*lx; if (grid->IsBoss()) { - fin.read((char *)&file_object, sizeof(file_object));assert( fin.fail()==0); - bytes += sizeof(file_object); - if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object)); - if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object)); - if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object)); - if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object)); - - munge(file_object, munged, csum); + fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0); + for(int x=0;x site({x,y,z,t}); + // The boss who read the file has their value poked + pokeSite(munged[x],Umu,site); } - // The boss who read the file has their value poked - pokeSite(munged,Umu,site); }}}} timer.Stop(); std::cout<Broadcast(0,(void *)&csum,sizeof(csum)); return csum; @@ -274,31 +276,34 @@ class BinaryIO { } uint64_t bytes=0; uint32_t csum=0; - fobj file_object; - sobj unmunged; + int lx = grid->_fdimensions[0]; + std::vector file_object(lx); + std::vector unmunged(lx); for(int t=0;t_fdimensions[3];t++){ for(int z=0;z_fdimensions[2];z++){ for(int y=0;y_fdimensions[1];y++){ - for(int x=0;x_fdimensions[0];x++){ + { - std::vector site({x,y,z,t}); + std::vector site({0,y,z,t}); // peek & write - peekSite(unmunged,Umu,site); - - munge(unmunged,file_object,csum); - + for(int x=0;xIsBoss() ) { - if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object)); - if(ieee32) htole32_v((void *)&file_object,sizeof(file_object)); - if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object)); - if(ieee64) htole64_v((void *)&file_object,sizeof(file_object)); - - // NB could gather an xstrip as an optimisation. - fout.write((char *)&file_object,sizeof(file_object));assert( fout.fail()==0); - bytes+=sizeof(file_object); + for(int x=0;xGlobalIndexToGlobalCoor(gidx,gcoor); grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); int l_idx=parallel.generator_idx(o_idx,i_idx); - //std::cout << GridLogDebug << "l_idx " << l_idx << " o_idx " << o_idx - // << " i_idx " << i_idx << " rank " << rank << std::endl; if ( grid->IsBoss() ) { fin.read((char *)&saved[0],bytes);assert( fin.fail()==0); @@ -460,14 +463,12 @@ class BinaryIO { int ieee64 = (format == std::string("IEEE64")); - // Take into account block size of parallel file systems want about - // 4-16MB chunks. // Ideally one reader/writer per xy plane and read these contiguously // with comms from nominated I/O nodes. 
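// The serial-I/O rework above replaces the per-site read/convert/poke loop
// with one x-strip per iteration; the read side reduces to this shape
// (y, z, t loop indices as in the surrounding code):
//
//   int lx = grid->_fdimensions[0];
//   std::vector<fobj> file_object(lx);
//   std::vector<sobj> munged(lx);
//   if (grid->IsBoss()) {
//       fin.read((char *)&file_object[0], sizeof(fobj)*lx); // one call/strip
//       be32toh_v((void *)&file_object[0], sizeof(fobj)*lx); // per format
//       for (int x = 0; x < lx; x++) munge(file_object[x], munged[x], csum);
//   }
//   for (int x = 0; x < lx; x++) {
//       std::vector<int> site({x, y, z, t});
//       pokeSite(munged[x], Umu, site);   // boss's value is distributed
//   }
//
// fewer, larger transfers being the point of the change.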
std::ifstream fin; int nd = grid->_ndimension; - std::vector parallel(nd,1); + std::vector parallel(nd,1); parallel[0] = 0; std::vector ioproc (nd); std::vector start(nd); std::vector range(nd); @@ -479,9 +480,15 @@ class BinaryIO { uint64_t slice_vol = 1; int IOnode = 1; - for(int d=0;d_ndimension;d++) { + int gstrip = grid->_gdimensions[0]; + int lstrip = grid->_ldimensions[0]; - if ( d == 0 ) parallel[d] = 0; + int chunk ; + if ( nd==1) chunk = gstrip; + else chunk = gstrip*grid->_ldimensions[1]; + + for(int d=0;d_ndimension;d++) { + if (parallel[d]) { range[d] = grid->_ldimensions[d]; start[d] = grid->_processor_coor[d]*range[d]; @@ -500,13 +507,16 @@ class BinaryIO { uint32_t tmp = IOnode; grid->GlobalSum(tmp); std::cout<< std::dec ; - std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <_ndimension;d++){ std::cout<< range[d]; if( d< grid->_ndimension-1 ) std::cout<< " x "; } std::cout << std::endl; + std::cout<< GridLogMessage<< "Parallel I/O local strip size is "<< lstrip <ThisRank(); int iorank = grid->RankFromProcessorCoor(ioproc); - if (!ILDG.is_ILDG) - if ( IOnode ) { - fin.open(file,std::ios::binary|std::ios::in); - } + if (!ILDG.is_ILDG) { + if ( IOnode ) { + fin.open(file,std::ios::binary|std::ios::in); + } + } ////////////////////////////////////////////////////////// // Find the location of each site and send to primary node @@ -528,16 +539,15 @@ class BinaryIO { Umu = zero; static uint32_t csum; csum=0;//static for SHMEM - fobj fileObj; - static sobj siteObj; // Static to place in symmetric region for SHMEM + std::vector fileObj(chunk); // FIXME + std::vector siteObj(chunk); // Use comm allocator to place in symmetric region for SHMEM - // need to implement these loops in Nd independent way with a lexico conversion - for(int tlex=0;tlex tsite(nd); // temporary mixed up site std::vector gsite(nd); std::vector lsite(nd); - std::vector iosite(nd); Lexicographic::CoorFromIndex(tsite,tlex,range); @@ -546,53 +556,68 @@ class BinaryIO { gsite[d] = tsite[d]+start[d]; // global site } - - ///////////////////////// - // Get the rank of owner of data - ///////////////////////// + /////////////////////////////////////////// + // Get the global lexico base of the chunk + /////////////////////////////////////////// int rank, o_idx,i_idx, g_idx; grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); grid->GlobalCoorToGlobalIndex(gsite,g_idx); - + //////////////////////////////// // iorank reads from the seek //////////////////////////////// if (myrank == iorank) { - if (ILDG.is_ILDG){ - // use C-LIME to populate the record - #ifdef HAVE_LIME - uint64_t sizeFO = sizeof(fileObj); +#ifdef HAVE_LIME + // use C-LIME to populate the record + uint64_t sizeFO = sizeof(fobj)*chunk; limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET); - int status = limeReaderReadData((void *)&fileObj, &sizeFO, ILDG.LR); - #endif + int status = limeReaderReadData((void *)&fileObj[0], &sizeFO, ILDG.LR); +#endif } else{ - fin.seekg(offset+g_idx*sizeof(fileObj)); - fin.read((char *)&fileObj,sizeof(fileObj)); + fin.seekg(offset+g_idx*sizeof(fobj)); + fin.read((char *)&fileObj[0],sizeof(fobj)*chunk); } - bytes+=sizeof(fileObj); + bytes+=sizeof(fobj)*chunk; - if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj)); + if(ieee32big) be32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); + if(ieee32) le32toh_v((void 
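// The chunk set up above is one full global row in x times the local y
// extent,
//
//   int chunk = (nd == 1) ? gstrip : gstrip*grid->_ldimensions[1];
//
// so each seek/read moves sizeof(fobj)*chunk contiguous bytes, the
// "global-X x Local-Y chunksize" of the commit message; the I/O rank then
// byte-swaps and munges the whole chunk and scatters it to the owning
// ranks in lstrip-sized x-strips via SendRecvPacket.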
*)&fileObj[0],sizeof(fobj)*chunk); + if(ieee64big) be64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); + if(ieee64) le64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); - munge(fileObj,siteObj,csum); + for(int c=0;cSendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj)); + for(int cc=0;cc_ldimensions[d]; // local site + gsite[d] = tsite[d]+start[d]; // global site } + grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); + + if ( rank != iorank ) { + if ( (myrank == rank) || (myrank==iorank) ) { + grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],iorank,rank,sizeof(sobj)*lstrip); + } + } + // Poke at destination + if ( myrank == rank ) { + for(int x=0;xBarrier(); // necessary? } - // Poke at destination - if ( myrank == rank ) { - pokeLocalSite(siteObj,Umu,lsite); - } - grid->Barrier(); // necessary? } grid->GlobalSum(csum); @@ -601,7 +626,7 @@ class BinaryIO { timer.Stop(); std::cout< tsite(nd); // temporary mixed up site std::vector gsite(nd); std::vector lsite(nd); - std::vector iosite(nd); Lexicographic::CoorFromIndex(tsite, tlex, range); diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index f0159d41..cd20c841 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -30,6 +30,9 @@ #ifndef GRID_NERSC_IO_H #define GRID_NERSC_IO_H +#define PARALLEL_READ +#undef PARALLEL_WRITE + #include #include #include @@ -326,8 +329,6 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#define PARALLEL_READ -#define PARALLEL_WRITE template static inline void readConfiguration(Lattice > &Umu,NerscField& header,std::string file) @@ -399,6 +400,7 @@ namespace Grid { <<" header "< Date: Thu, 25 May 2017 13:32:24 +0100 Subject: [PATCH 026/170] Attempts to speed up the parallel IO --- lib/parallelIO/BinaryIO.h | 204 +++++++++++++++++++++----------------- lib/parallelIO/NerscIO.h | 18 +++- tests/IO/Test_nersc_io.cc | 2 +- 3 files changed, 133 insertions(+), 91 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index ab449f92..c1fca348 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -228,11 +228,11 @@ class BinaryIO { bytes += sizeof(fobj)*lx; if (grid->IsBoss()) { fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0); + if (ieee32big) be32toh_v((void *)&file_object[0], sizeof(fobj)*lx); + if (ieee32) le32toh_v((void *)&file_object[0], sizeof(fobj)*lx); + if (ieee64big) be64toh_v((void *)&file_object[0], sizeof(fobj)*lx); + if (ieee64) le64toh_v((void *)&file_object[0], sizeof(fobj)*lx); for(int x=0;xIsBoss() ) { for(int x=0;xThisRank() ){ - // std::cout << "rank" << rank<<" Getting state for index "<Broadcast(rank, (void *)&saved[0], bytes); + + if ( rank != 0 ) { + grid->Broadcast(rank, (void *)&saved[0], bytes); + } + + grid->Barrier(); if ( grid->IsBoss() ) { Uint32Checksum((uint32_t *)&saved[0],bytes,csum); @@ -370,8 +375,9 @@ class BinaryIO { grid->Broadcast(0, (void *)&csum, sizeof(csum)); - if (grid->IsBoss()) + if (grid->IsBoss()) { fout.close(); + } timer.Stop(); @@ -426,6 +432,7 @@ class BinaryIO { } grid->Broadcast(0,(void *)&saved[0],bytes); + grid->Barrier(); if( rank == grid->ThisRank() ){ parallel.SetState(saved,l_idx); @@ -434,8 +441,8 @@ class BinaryIO { if ( grid->IsBoss() ) { fin.read((char *)&saved[0],bytes);assert( 
fin.fail()==0); - serial.SetState(saved,0); Uint32Checksum((uint32_t *)&saved[0],bytes,csum); + serial.SetState(saved,0); } std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; @@ -445,7 +452,6 @@ class BinaryIO { return csum; } - template static inline uint32_t readObjectParallel(Lattice &Umu, std::string file, @@ -528,6 +534,10 @@ class BinaryIO { if (!ILDG.is_ILDG) { if ( IOnode ) { fin.open(file,std::ios::binary|std::ios::in); + if ( !fin.is_open() ) { + std::cout << GridLogMessage << "readObjectParallel: Error opening file " << file << std::endl; + exit(0); + } } } @@ -540,7 +550,7 @@ class BinaryIO { static uint32_t csum; csum=0;//static for SHMEM std::vector fileObj(chunk); // FIXME - std::vector siteObj(chunk); // Use comm allocator to place in symmetric region for SHMEM + std::vector siteObj(chunk); // Use alignedAllocator to place in symmetric region for SHMEM // need to implement these loops in Nd independent way with a lexico conversion for(int tlex=0;tlex gsite(nd); std::vector lsite(nd); - Lexicographic::CoorFromIndex(tsite,tlex,range); - - for(int d=0;d_ldimensions[d]; // local site - gsite[d] = tsite[d]+start[d]; // global site - } + int rank, o_idx,i_idx, g_idx; /////////////////////////////////////////// // Get the global lexico base of the chunk /////////////////////////////////////////// - int rank, o_idx,i_idx, g_idx; + Lexicographic::CoorFromIndex(tsite,tlex,range); + for(int d=0;dGlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); grid->GlobalCoorToGlobalIndex(gsite,g_idx); @@ -571,11 +577,14 @@ class BinaryIO { if (ILDG.is_ILDG){ #ifdef HAVE_LIME // use C-LIME to populate the record - uint64_t sizeFO = sizeof(fobj)*chunk; + uint64_t sizeFO = sizeof(fobj); + uint64_t sizeChunk= sizeFO*chunk; limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET); - int status = limeReaderReadData((void *)&fileObj[0], &sizeFO, ILDG.LR); + int status = limeReaderReadData((void *)&fileObj[0], &sizeChunk, ILDG.LR); +#else + assert(0); #endif - } else{ + } else { fin.seekg(offset+g_idx*sizeof(fobj)); fin.read((char *)&fileObj[0],sizeof(fobj)*chunk); } @@ -630,6 +639,7 @@ class BinaryIO { return csum; } + ////////////////////////////////////////////////////////// // Parallel writer ////////////////////////////////////////////////////////// @@ -643,9 +653,9 @@ class BinaryIO { GridBase *grid = Umu._grid; int ieee32big = (format == std::string("IEEE32BIG")); - int ieee32 = (format == std::string("IEEE32")); + int ieee32 = (format == std::string("IEEE32")); int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); + int ieee64 = (format == std::string("IEEE64")); if (!(ieee32big || ieee32 || ieee64big || ieee64)) { std::cout << GridLogError << "Unrecognized file format " << format << std::endl; @@ -658,7 +668,9 @@ class BinaryIO { assert(grid->CheckerBoarded(d) == 0); } - std::vector parallel(nd, 1); + // Parallel in yzt, serial funnelled in "x". 
+ // gx x ly chunk size + std::vector parallel(nd, 1); parallel[0] = 0; std::vector ioproc(nd); std::vector start(nd); std::vector range(nd); @@ -666,9 +678,13 @@ class BinaryIO { uint64_t slice_vol = 1; int IOnode = 1; + int gstrip = grid->_gdimensions[0]; + int lstrip = grid->_ldimensions[0]; + int chunk; + if ( nd==1) chunk = gstrip; + else chunk = gstrip*grid->_ldimensions[1]; for (int d = 0; d < grid->_ndimension; d++) { - if (d != grid->_ndimension - 1) parallel[d] = 0; if (parallel[d]) { range[d] = grid->_ldimensions[d]; @@ -688,14 +704,16 @@ class BinaryIO { { uint32_t tmp = IOnode; grid->GlobalSum(tmp); - std::cout<< GridLogMessage<< "Parallel write I/O from "<< file - << " with " <_ndimension;d++){ std::cout<< range[d]; if( d< grid->_ndimension-1 ) std::cout<< " x "; } std::cout << std::endl; + std::cout<< GridLogMessage<< "Parallel I/O local strip size is "<< lstrip <RankFromProcessorCoor(ioproc); // Take into account block size of parallel file systems want about - // 4-16MB chunks. // Ideally one reader/writer per xy plane and read these contiguously // with comms from nominated I/O nodes. std::ofstream fout; - if (!ILDG.is_ILDG) - if (IOnode){ - fout.open(file, std::ios::binary | std::ios::in | std::ios::out); - if (!fout.is_open()) { - std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file - << std::endl; - exit(0); - } - } - - + if (!ILDG.is_ILDG) { + if (IOnode){ + fout.open(file, std::ios::binary | std::ios::in | std::ios::out); + if (!fout.is_open()) { + std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file << std::endl; + exit(0); + } + } + } + ////////////////////////////////////////////////////////// // Find the location of each site and send to primary node // Take loop order from Chroma; defines loop order now that NERSC doc no @@ -729,72 +745,82 @@ class BinaryIO { ////////////////////////////////////////////////////////// uint32_t csum = 0; - fobj fileObj; - static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate - // with AlignedAllocator + std::vector fileObj(chunk); + std::vector siteObj(chunk); // should aggregate a whole chunk and then write. // need to implement these loops in Nd independent way with a lexico // conversion - for (int tlex = 0; tlex < slice_vol; tlex++) { + for (int tlex = 0; tlex < slice_vol; tlex+=chunk) { std::vector tsite(nd); // temporary mixed up site std::vector gsite(nd); std::vector lsite(nd); - Lexicographic::CoorFromIndex(tsite, tlex, range); - - for(int d = 0;d < nd; d++){ - lsite[d] = tsite[d] % grid->_ldimensions[d]; // local site - gsite[d] = tsite[d] + start[d]; // global site - } - - ///////////////////////// - // Get the rank of owner of data - ///////////////////////// int rank, o_idx, i_idx, g_idx; - grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite); - grid->GlobalCoorToGlobalIndex(gsite, g_idx); - //////////////////////////////// - // iorank writes from the seek - //////////////////////////////// + // Possibly do transport through pt2pt + for(int cc=0;ccSendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj)); + for(int d=0;d_ldimensions[d]; // local site + gsite[d] = tsite[d]+start[d]; // global site + } + grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); + + // Owner of data peeks it over lstrip + if ( myrank == rank ) { + for(int x=0;xSendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],rank,iorank,sizeof(sobj)*lstrip); + } } } grid->Barrier(); // necessary? 
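    // In outline, the gather above and the write-out that follows handle
    // each chunk in three phases (a summary sketch; names as in this routine):
    //  1) each rank owning part of the chunk peeks its local x-strip of
    //     lstrip sites into siteObj with peekLocalSite;
    //  2) owners that are not the nominated I/O rank funnel their strips
    //     to it point-to-point, sizeof(sobj)*lstrip bytes at a time;
    //  3) the I/O rank munges and byte-swaps the assembled chunk, then
    //     writes it with a single seek to offset + g_idx*sizeof(fobj).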
+ ///////////////////////// + // Get the global lexico base of the chunk + ///////////////////////// + Lexicographic::CoorFromIndex(tsite, tlex, range); + for(int d = 0;d < nd; d++){ gsite[d] = tsite[d] + start[d];} + grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite); + grid->GlobalCoorToGlobalIndex(gsite, g_idx); + if (myrank == iorank) { - munge(siteObj, fileObj, csum); - if (ieee32big) htobe32_v((void *)&fileObj, sizeof(fileObj)); - if (ieee32) htole32_v((void *)&fileObj, sizeof(fileObj)); - if (ieee64big) htobe64_v((void *)&fileObj, sizeof(fileObj)); - if (ieee64) htole64_v((void *)&fileObj, sizeof(fileObj)); + for(int c=0;cBarrier(); // necessary? - if (IOnode) - fout.close(); - + if (!ILDG.is_ILDG) { + if (IOnode) { + fout.close(); + } + } return csum; } diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index cd20c841..cf3e41e4 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -31,7 +31,7 @@ #define GRID_NERSC_IO_H #define PARALLEL_READ -#undef PARALLEL_WRITE +#define PARALLEL_WRITE #include #include @@ -401,6 +401,18 @@ namespace Grid { std::cout<= 1.0e-5 ) { + std::cout << " Plaquette mismatch "< uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset); + if ( csum != header.checksum ) { + std::cerr << "checksum mismatch "< U(4,&Fine); - SU3::ColdConfiguration(pRNGa,Umu); + SU3::HotConfiguration(pRNGa,Umu); NerscField header; std::string file("./ckpoint_lat.4000"); From 69470ccc10e688908b9d17ea94a6e18759a8dc1a Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 25 May 2017 13:41:26 +0100 Subject: [PATCH 027/170] Update to do list --- TODO | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/TODO b/TODO index 672879cd..a5d4cabd 100644 --- a/TODO +++ b/TODO @@ -2,9 +2,9 @@ TODO: --------------- Peter's work list: -2)- Precision conversion and sort out localConvert <-- -3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started -4)- Binary I/O speed up & x-strips +1)- Precision conversion and sort out localConvert <-- +2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- + -- Profile CG, BlockCG, etc... 
Flop count/rate -- PARTIAL, time but no flop/s yet -- Physical propagator interface -- Conserved currents @@ -13,6 +13,7 @@ Peter's work list: -- HDCR resume Recent DONE +-- Binary I/O speed up & x-strips <-- DONE -- Cut down the exterior overhead <-- DONE -- Interior legs from SHM comms <-- DONE -- Half-precision comms <-- DONE From 725c513d9421732e212fe693120de64020299275 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 May 2017 16:47:32 -0400 Subject: [PATCH 028/170] Better MPI3 benchmarking --- benchmarks/Benchmark_comms.cc | 127 ++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 52 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index ce881ef6..532532f8 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -66,7 +66,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); std::cout<1) nmu++; @@ -88,6 +88,9 @@ int main (int argc, char ** argv) lat*mpi_layout[3]}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); @@ -132,13 +135,13 @@ int main (int argc, char ** argv) } Grid.SendToRecvFromComplete(requests); Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds + double stop=usecond(); + t_time[i] = stop-start; // microseconds } timestat.statistics(t_time); - double dbytes = bytes; + double dbytes = bytes*ppn; double xbytes = dbytes*2.0*ncomm; double rbytes = xbytes; double bidibytes = xbytes+rbytes; @@ -165,6 +168,9 @@ int main (int argc, char ** argv) std::vector latt_size ({lat,lat,lat,lat}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); @@ -213,14 +219,14 @@ int main (int argc, char ** argv) } } Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds + double stop=usecond(); + t_time[i] = stop-start; // microseconds } timestat.statistics(t_time); - double dbytes = bytes; + double dbytes = bytes*ppn; double xbytes = dbytes*2.0*ncomm; double rbytes = xbytes; double bidibytes = xbytes+rbytes; @@ -251,6 +257,9 @@ int main (int argc, char ** argv) lat*mpi_layout[3]}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; std::vector xbuf(8); std::vector rbuf(8); @@ -258,59 +267,66 @@ int main (int argc, char ** argv) for(int d=0;d<8;d++){ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + double dbytes; for(int i=0;i requests; - ncomm=0; for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { ncomm++; int comm_proc=1; int xmit_to_rank; int recv_from_rank; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); + dbytes+= + 
Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); + dbytes+= + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); } } Grid.StencilSendToRecvFromComplete(requests); Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds - + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } timestat.statistics(t_time); - double dbytes = bytes; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; std::cout< xbuf(8); std::vector rbuf(8); @@ -345,16 +364,18 @@ int main (int argc, char ** argv) for(int d=0;d<8;d++){ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); - + double dbytes; for(int i=0;i requests; - + dbytes=0; ncomm=0; for(int mu=0;mu<4;mu++){ @@ -366,41 +387,43 @@ int main (int argc, char ** argv) int recv_from_rank; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); + dbytes+= + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); Grid.StencilSendToRecvFromComplete(requests); requests.resize(0); comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); + dbytes+= + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); Grid.StencilSendToRecvFromComplete(requests); requests.resize(0); } } - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds - + Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } timestat.statistics(t_time); - double dbytes = bytes; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; std::cout< Date: Tue, 30 May 2017 23:37:02 +0100 Subject: [PATCH 029/170] Precision safe compile --- tests/forces/Test_contfrac_force.cc | 2 +- tests/forces/Test_dwf_force.cc | 2 +- tests/forces/Test_dwf_gpforce.cc | 6 +++--- tests/forces/Test_gp_rect_force.cc | 2 +- tests/forces/Test_gpdwf_force.cc | 2 +- tests/forces/Test_gpwilson_force.cc | 2 +- tests/forces/Test_laplacian_force.cc | 2 +- tests/forces/Test_mobius_force.cc | 2 +- tests/forces/Test_partfrac_force.cc | 2 +- tests/forces/Test_rect_force.cc | 2 +- tests/forces/Test_wilson_force.cc | 6 +++--- 
tests/forces/Test_zmobius_force.cc | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index 227ad5a0..2afb4dde 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -139,7 +139,7 @@ int main (int argc, char ** argv) } - Complex dSpred = sum(dS); + ComplexD dSpred = sum(dS); std::cout << GridLogMessage << " S "< Date: Tue, 30 May 2017 23:38:02 +0100 Subject: [PATCH 030/170] Cleaner code --- lib/simd/Grid_vector_types.h | 31 +++++++------------------------ lib/simd/Grid_vector_unops.h | 7 ------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 0048382f..1ebe7379 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -327,10 +327,6 @@ class Grid_simd { // provides support /////////////////////////////////////// - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") - //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; @@ -364,9 +360,6 @@ class Grid_simd { ret.v = cx.v; return ret; } - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC pop_options - //#endif /////////////////////// // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh @@ -428,7 +421,6 @@ class Grid_simd { }; // end of Grid_simd class definition - inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } inline void permute(RealD &y,RealD b, int perm) { y=b; } @@ -838,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);} inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);} - - // Check our vector types are of an appropriate size. 
#if defined QPX static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); @@ -854,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc ///////////////////////////////////////// template struct is_simd : public std::false_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; -template -using IfSimd = Invoke::value, int> >; -template -using IfNotSimd = Invoke::value, unsigned> >; +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; } #endif diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h index 2afac190..2244566f 100644 --- a/lib/simd/Grid_vector_unops.h +++ b/lib/simd/Grid_vector_unops.h @@ -179,13 +179,6 @@ inline Grid_simd div(const Grid_simd &r, Integer y) { //////////////////////////////////////////////////////////////////////////// // Allows us to assign into **conformable** real vectors from complex //////////////////////////////////////////////////////////////////////////// -// template < class S, class V > -// inline auto ComplexRemove(const Grid_simd &c) -> -// Grid_simd::Real,V> { -// Grid_simd::Real,V> ret; -// ret.v = c.v; -// return ret; -// } template struct AndFunctor { scalar operator()(const scalar &x, const scalar &y) const { return x & y; } From 58e8d0a10d69c794c2839e6b9093bee3c2b32da2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 30 May 2017 23:38:30 +0100 Subject: [PATCH 031/170] reverse direction lexico mapping --- lib/lattice/Lattice_transfer.h | 50 +++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 68de52d0..c8ba0928 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -551,7 +551,10 @@ void Replicate(Lattice &coarse,Lattice & fine) //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order template -typename std::enable_if::value && !isSIMDvectorized::value, void>::type unvectorizeToLexOrdArray(std::vector &out, const Lattice &in){ +typename std::enable_if::value && !isSIMDvectorized::value, void>::type +unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) +{ + typedef typename vobj::vector_type vtype; GridBase* in_grid = in._grid; @@ -590,6 +593,51 @@ typename std::enable_if::value && !isSIMDvectorized extract1(in_vobj, out_ptrs, 0); } } +//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order +template +typename std::enable_if::value && !isSIMDvectorized::value, void>::type +vectorizeFromLexOrdArray(std::vector &in, const Lattice &out) +{ + + typedef typename vobj::vector_type vtype; + + GridBase* grid = out._grid; + assert(in.size()==grid->lSites()); + + int ndim = grid->Nd(); + int nsimd = vtype::Nsimd(); + + std::vector > icoor(nsimd); + + for(int lane=0; lane < nsimd; lane++){ + icoor[lane].resize(ndim); + grid->iCoorFromIindex(icoor[lane],lane); + } + + parallel_for(int oidx = 0; oidx < 
grid->oSites(); oidx++){ //loop over outer index + //Assemble vector of pointers to output elements + std::vector ptrs(nsimd); + + std::vector ocoor(ndim); + grid->oCoorFromOindex(ocoor, oidx); + + std::vector lcoor(grid->Nd()); + + for(int lane=0; lane < nsimd; lane++){ + for(int mu=0;mu_rdimensions[mu]*icoor[lane][mu]; + + int lex; + Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions); + ptrs[lane] = &in[lex]; + } + + //pack from those ptrs + vobj vecobj; + merge1(vecobj, ptrs, 0); + out._odata[oidx] = vecobj; + } +} //Convert a Lattice from one precision to another template From e30fa9f4b8fcce40211e69d598617992899b03d4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 30 May 2017 23:39:16 +0100 Subject: [PATCH 032/170] RankCount; need to clean up ambigious ProcessCount --- lib/communicator/Communicator_base.cc | 2 ++ lib/communicator/Communicator_base.h | 2 ++ lib/communicator/Communicator_mpi3.cc | 1 + 3 files changed, 5 insertions(+) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 98d2abf4..557fef48 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -60,6 +60,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) { ///////////////////////////////// // Grid information queries ///////////////////////////////// +int CartesianCommunicator::Dimensions(void) { return _ndimension; }; int CartesianCommunicator::IsBoss(void) { return _processor==0; }; int CartesianCommunicator::BossRank(void) { return 0; }; int CartesianCommunicator::ThisRank(void) { return _processor; }; @@ -91,6 +92,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) int CartesianCommunicator::NodeCount(void) { return ProcessorCount();}; +int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index e0b9f2c3..23d4f647 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -148,6 +148,7 @@ class CartesianCommunicator { int RankFromProcessorCoor(std::vector &coor); void ProcessorCoorFromRank(int rank,std::vector &coor); + int Dimensions(void) ; int IsBoss(void) ; int BossRank(void) ; int ThisRank(void) ; @@ -155,6 +156,7 @@ class CartesianCommunicator { const std::vector & ProcessorGrid(void) ; int ProcessorCount(void) ; int NodeCount(void) ; + int RankCount(void) ; //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index a8bffc14..54a0f9b5 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -65,6 +65,7 @@ std::vector CartesianCommunicator::MyGroup; std::vector CartesianCommunicator::ShmCommBufs; int CartesianCommunicator::NodeCount(void) { return GroupSize;}; +int CartesianCommunicator::RankCount(void) { return WorldSize;}; #undef FORCE_COMMS From 53a9aeb9653a312ffed057eccf65f7de0e193742 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 30 May 2017 23:39:53 +0100 Subject: [PATCH 033/170] Cosmetic only --- lib/tensors/Tensor_traits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index ab20b807..c1ef397a 
100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -281,8 +281,8 @@ namespace Grid { template class getPrecision{ public: - typedef typename getVectorType::type vector_obj; //get the vector_obj (i.e. a grid Tensor) if its a Lattice, do nothing otherwise (i.e. if fundamental or grid Tensor) - + //get the vector_obj (i.e. a grid Tensor) if its a Lattice, do nothing otherwise (i.e. if fundamental or grid Tensor) + typedef typename getVectorType::type vector_obj; typedef typename GridTypeMapper::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types typedef typename GridTypeMapper::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type From ef1b7db374ede8eee0011b1db3fc6cd076d9bfb8 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 30 May 2017 23:40:11 +0100 Subject: [PATCH 034/170] Diff comparison check --- tests/IO/Test_nersc_io.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index cf919a7d..8507df13 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -88,7 +88,12 @@ int main (int argc, char ** argv) int precision32 = 0; int tworow = 0; NerscIO::writeConfiguration(Umu,file,tworow,precision32); + Umu_saved = Umu; NerscIO::readConfiguration(Umu,header,file); + Umu_diff = Umu - Umu_saved; + //std::cout << "Umu_save "< Date: Tue, 30 May 2017 23:40:39 +0100 Subject: [PATCH 035/170] Beginning move to MPI IO --- lib/parallelIO/NerscIO.h | 43 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index cf3e41e4..ab535dac 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -30,7 +30,10 @@ #ifndef GRID_NERSC_IO_H #define GRID_NERSC_IO_H -#define PARALLEL_READ +#undef PARALLEL_READ +#undef SERIAL_READ +#define MPI_READ + #define PARALLEL_WRITE #include @@ -355,7 +358,12 @@ namespace Grid { #ifdef PARALLEL_READ csum=BinaryIO::readObjectParallel, LorentzColour2x3F> (Umu,file,Nersc3x2munger(), offset,format); -#else +#endif +#ifdef MPI_READ + csum=BinaryIO::readObjectMPI, LorentzColour2x3F> + (Umu,file,Nersc3x2munger(), offset,format); +#endif +#ifdef SERIAL_READ csum=BinaryIO::readObjectSerial, LorentzColour2x3F> (Umu,file,Nersc3x2munger(), offset,format); #endif @@ -364,7 +372,12 @@ namespace Grid { #ifdef PARALLEL_READ csum=BinaryIO::readObjectParallel, LorentzColour2x3D> (Umu,file,Nersc3x2munger(),offset,format); -#else +#endif +#ifdef MPI_READ + csum=BinaryIO::readObjectMPI, LorentzColour2x3D> + (Umu,file,Nersc3x2munger(),offset,format); +#endif +#ifdef SERIAL_READ csum=BinaryIO::readObjectSerial, LorentzColour2x3D> (Umu,file,Nersc3x2munger(),offset,format); #endif @@ -374,7 +387,12 @@ namespace Grid { #ifdef PARALLEL_READ csum=BinaryIO::readObjectParallel,LorentzColourMatrixF> (Umu,file,NerscSimpleMunger(),offset,format); -#else +#endif +#ifdef MPI_READ + csum=BinaryIO::readObjectMPI,LorentzColourMatrixF> + (Umu,file,NerscSimpleMunger(),offset,format); +#endif +#ifdef SERIAL_READ csum=BinaryIO::readObjectSerial,LorentzColourMatrixF> (Umu,file,NerscSimpleMunger(),offset,format); #endif @@ -383,7 +401,12 @@ namespace Grid { #ifdef PARALLEL_READ csum=BinaryIO::readObjectParallel,LorentzColourMatrixD> (Umu,file,NerscSimpleMunger(),offset,format); -#else +#endif +#ifdef MPI_READ + csum=BinaryIO::readObjectMPI,LorentzColourMatrixD> + 
(Umu,file,NerscSimpleMunger(),offset,format); +#endif +#ifdef SERIAL_READ csum=BinaryIO::readObjectSerial,LorentzColourMatrixD> (Umu,file,NerscSimpleMunger(),offset,format); #endif @@ -411,13 +434,13 @@ namespace Grid { std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl; std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl; std::cerr << " csum " < From 1e429a0d57aa4c5efaa458a198cd7d7a49cb2f34 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 30 May 2017 23:41:07 +0100 Subject: [PATCH 036/170] Added MPI version --- lib/parallelIO/BinaryIO.h | 145 +++++++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 1 deletion(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index c1fca348..cbc619ef 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -250,6 +250,149 @@ class BinaryIO { return csum; } + template + static inline uint32_t readObjectMPI(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) + { + typedef typename vobj::scalar_object sobj; + + GridBase *grid = Umu._grid; + + std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; + GridStopWatch timer; timer.Start(); + + Umu = zero; + uint32_t csum=0; + uint64_t bytes=0; + + int ndim = grid->Dimensions(); + int nrank = grid->ProcessorCount(); + int myrank = grid->ThisRank(); + + std::vector psizes = grid->ProcessorGrid(); + std::vector pcoor = grid->ThisProcessorCoor(); + std::vector gLattice= grid->GlobalDimensions(); + std::vector lLattice= grid->LocalDimensions(); + + std::vector distribs(ndim,MPI_DISTRIBUTE_BLOCK); + std::vector dargs (ndim,MPI_DISTRIBUTE_DFLT_DARG); + + std::vector lStart(ndim); + std::vector gStart(ndim); + + // Flatten the file + int lsites = grid->lSites(); + std::vector scalardata(lsites); + std::vector iodata(lsites); // Munge, checksum, byte order in here + + for(int d=0;dcommunicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); + assert(ierr==0); + ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); + // std::cout<< "MPI File set view returned " <GlobalSum(csum); + grid->Barrier(); + + vectorizeFromLexOrdArray(scalardata,Umu); + + timer.Stop(); + std::cout< static inline uint32_t writeObjectSerial(Lattice &Umu,std::string file,munger munge,int offset, const std::string & format) @@ -597,7 +740,7 @@ class BinaryIO { for(int c=0;c Date: Thu, 1 Jun 2017 17:36:18 -0400 Subject: [PATCH 037/170] As local vols increase, use 64 bits for safety --- lib/lattice/Lattice_transfer.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index c8ba0928..cbf31f86 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -595,8 +595,9 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) } //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order template -typename std::enable_if::value && !isSIMDvectorized::value, void>::type -vectorizeFromLexOrdArray(std::vector &in, const Lattice &out) +typename std::enable_if::value + && !isSIMDvectorized::value, void>::type +vectorizeFromLexOrdArray( std::vector &in, Lattice &out) { typedef typename vobj::vector_type vtype; @@ -614,7 +615,7 @@ vectorizeFromLexOrdArray(std::vector &in, const Lattice &out) grid->iCoorFromIindex(icoor[lane],lane); } - parallel_for(int oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index + parallel_for(uint64_t 
oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index //Assemble vector of pointers to output elements std::vector ptrs(nsimd); @@ -624,8 +625,10 @@ vectorizeFromLexOrdArray(std::vector &in, const Lattice &out) std::vector lcoor(grid->Nd()); for(int lane=0; lane < nsimd; lane++){ - for(int mu=0;mu_rdimensions[mu]*icoor[lane][mu]; + } int lex; Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions); @@ -663,7 +666,7 @@ void precisionChange(Lattice &out, const Lattice &in){ std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); - parallel_for(int out_oidx=0;out_oidxoSites();out_oidx++){ + parallel_for(uint64_t out_oidx=0;out_oidxoSites();out_oidx++){ std::vector out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); From 21421656abb44bc872ca85c3364eed638fff8a5f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Jun 2017 17:36:53 -0400 Subject: [PATCH 038/170] Big changes improving the code to use MPI IO --- lib/parallelIO/BinaryIO.h | 1065 +++++++++++-------------------------- 1 file changed, 297 insertions(+), 768 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index cbc619ef..13341927 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -38,7 +38,12 @@ #include #include +namespace Grid { + +///////////////////////////////////////////////////////////////////////////////// +// Byte reversal garbage +///////////////////////////////////////////////////////////////////////////////// inline uint32_t byte_reverse32(uint32_t f) { f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; return f; @@ -60,63 +65,155 @@ inline uint64_t Grid_ntohll(uint64_t A) { } #endif -namespace Grid { - - // A little helper - inline void removeWhitespace(std::string &key) - { - key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end()); +///////////////////////////////////////////////////////////////////////////////// +// Simple classes for precision conversion +///////////////////////////////////////////////////////////////////////////////// +template +struct BinarySimpleUnmunger { + typedef typename getPrecision::real_scalar_type fobj_stype; + typedef typename getPrecision::real_scalar_type sobj_stype; + + void operator()(sobj &in, fobj &out) { + // take word by word and transform accoding to the status + fobj_stype *out_buffer = (fobj_stype *)&out; + sobj_stype *in_buffer = (sobj_stype *)∈ + size_t fobj_words = sizeof(out) / sizeof(fobj_stype); + size_t sobj_words = sizeof(in) / sizeof(sobj_stype); + assert(fobj_words == sobj_words); + + for (unsigned int word = 0; word < sobj_words; word++) + out_buffer[word] = in_buffer[word]; // type conversion on the fly + } +}; +template +struct BinarySimpleMunger { + typedef typename getPrecision::real_scalar_type fobj_stype; + typedef typename getPrecision::real_scalar_type sobj_stype; + + void operator()(fobj &in, sobj &out) { + // take word by word and transform accoding to the status + fobj_stype *in_buffer = (fobj_stype *)∈ + sobj_stype *out_buffer = (sobj_stype *)&out; + size_t fobj_words = sizeof(in) / sizeof(fobj_stype); + size_t sobj_words = sizeof(out) / sizeof(sobj_stype); + assert(fobj_words == sobj_words); + + for (unsigned int word = 0; word < sobj_words; word++) + out_buffer[word] = in_buffer[word]; // type conversion on the fly + + } +}; +// A little helper +inline void removeWhitespace(std::string &key) +{ + key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end()); +} + 
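A minimal usage sketch of the simple mungers above; the type pairing here is illustrative, but any fobj/sobj pair whose word counts match will do:

    // Illustrative only: convert a single-precision file object to the
    // double-precision in-memory object, word by word.
    BinarySimpleMunger<LorentzColourMatrixF, LorentzColourMatrixD> munge;
    LorentzColourMatrixF fword;  // fobj: file representation
    LorentzColourMatrixD sword;  // sobj: in-memory representation
    munge(fword, sword);         // float -> double on the fly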
+/////////////////////////////////////////////////////////////////////////////////////////////////// +// Static class holding the parallel IO code +// Could just use a namespace +/////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { - public: + ///////////////////////////////////////////////////////////////////////////// + // more byte manipulation helpers + ///////////////////////////////////////////////////////////////////////////// + static inline void Uint32Checksum(uint32_t *buf,uint64_t buf_size_bytes,uint32_t &csum) + { +#pragma omp parallel + { + uint32_t csum_thr=0; + uint64_t count = buf_size_bytes/sizeof(uint32_t); +#pragma omp for + for(uint64_t i=0;i>8) | ((f&0xFF000000UL)>>24) ; fp[i] = ntohl(f); } } - // BE is same as network - static inline void be64toh_v(void *file_object,uint32_t bytes) + static inline void be64toh_v(void *file_object,uint64_t bytes) { uint64_t * f = (uint64_t *)file_object; - for(int i=0;i*sizeof(uint64_t)>8) | ((f&0xFF000000UL)>>24) ; @@ -126,143 +223,23 @@ class BinaryIO { fp[i] = Grid_ntohll(g); } } - - template static inline void Uint32Checksum(Lattice &lat,munger munge,uint32_t &csum) + ///////////////////////////////////////////////////////////////////////////// + // Real action: + // Read or Write distributed lexico array of ANY object to a specific location in file + ////////////////////////////////////////////////////////////////////////////////////// + template + static inline uint32_t IOobject(word w, + GridBase *grid, + std::vector &iodata, + std::string file, + int offset, + const std::string &format, int doread) { - typedef typename vobj::scalar_object sobj; - GridBase *grid = lat._grid ; - std::cout <Barrier(); + GridStopWatch timer; + GridStopWatch bstimer; - csum = 0; - std::vector lcoor; - for(int l=0;llSites();l++){ - Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions); - peekLocalSite(siteObj,lat,lcoor); - munge(siteObj,fileObj,csum); - } - grid->GlobalSum(csum); - } - - static inline void Uint32Checksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum) - { - for(int i=0;i*sizeof(uint32_t) - struct BinarySimpleUnmunger { - typedef typename getPrecision::real_scalar_type fobj_stype; - typedef typename getPrecision::real_scalar_type sobj_stype; - - void operator()(sobj &in, fobj &out, uint32_t &csum) { - // take word by word and transform accoding to the status - fobj_stype *out_buffer = (fobj_stype *)&out; - sobj_stype *in_buffer = (sobj_stype *)∈ - size_t fobj_words = sizeof(out) / sizeof(fobj_stype); - size_t sobj_words = sizeof(in) / sizeof(sobj_stype); - assert(fobj_words == sobj_words); - - for (unsigned int word = 0; word < sobj_words; word++) - out_buffer[word] = in_buffer[word]; // type conversion on the fly - - BinaryIO::Uint32Checksum((uint32_t *)&out, sizeof(out), csum); - } - }; - - template - struct BinarySimpleMunger { - typedef typename getPrecision::real_scalar_type fobj_stype; - typedef typename getPrecision::real_scalar_type sobj_stype; - - void operator()(fobj &in, sobj &out, uint32_t &csum) { - // take word by word and transform accoding to the status - fobj_stype *in_buffer = (fobj_stype *)∈ - sobj_stype *out_buffer = (sobj_stype *)&out; - size_t fobj_words = sizeof(in) / sizeof(fobj_stype); - size_t sobj_words = sizeof(out) / sizeof(sobj_stype); - assert(fobj_words == sobj_words); - - for (unsigned int word = 0; word < sobj_words; word++) - out_buffer[word] = in_buffer[word]; // type conversion on the fly - - BinaryIO::Uint32Checksum((uint32_t 
*)&in, sizeof(in), csum); - } - }; - - template - static inline uint32_t readObjectSerial(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) - { - typedef typename vobj::scalar_object sobj; - - GridBase *grid = Umu._grid; - - std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl; - GridStopWatch timer; timer.Start(); - - int ieee32big = (format == std::string("IEEE32BIG")); - int ieee32 = (format == std::string("IEEE32")); - int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); - - // Find the location of each site and send to primary node - // Take loop order from Chroma; defines loop order now that NERSC doc no longer - // available (how short sighted is that?) - std::ifstream fin(file,std::ios::binary|std::ios::in); - fin.seekg(offset); - - Umu = zero; uint32_t csum=0; - uint64_t bytes=0; - - int lx = grid->_fdimensions[0]; - std::vector file_object(lx); - std::vector munged(lx); - for(int t=0;t_fdimensions[3];t++){ - for(int z=0;z_fdimensions[2];z++){ - for(int y=0;y_fdimensions[1];y++){ - { - bytes += sizeof(fobj)*lx; - if (grid->IsBoss()) { - fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0); - if (ieee32big) be32toh_v((void *)&file_object[0], sizeof(fobj)*lx); - if (ieee32) le32toh_v((void *)&file_object[0], sizeof(fobj)*lx); - if (ieee64big) be64toh_v((void *)&file_object[0], sizeof(fobj)*lx); - if (ieee64) le64toh_v((void *)&file_object[0], sizeof(fobj)*lx); - for(int x=0;x site({x,y,z,t}); - // The boss who read the file has their value poked - pokeSite(munged[x],Umu,site); - } - }}}} - timer.Stop(); - std::cout<Broadcast(0,(void *)&csum,sizeof(csum)); - return csum; - } - - template - static inline uint32_t readObjectMPI(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) - { - typedef typename vobj::scalar_object sobj; - - GridBase *grid = Umu._grid; - - std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; - GridStopWatch timer; timer.Start(); - - Umu = zero; - uint32_t csum=0; - uint64_t bytes=0; int ndim = grid->Dimensions(); int nrank = grid->ProcessorCount(); @@ -280,9 +257,8 @@ class BinaryIO { std::vector gStart(ndim); // Flatten the file - int lsites = grid->lSites(); - std::vector scalardata(lsites); - std::vector iodata(lsites); // Munge, checksum, byte order in here + uint64_t lsites = grid->lSites(); + iodata.resize(lsites); for(int d=0;dcommunicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); - assert(ierr==0); - ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); - // std::cout<< "MPI File set view returned " <communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); + ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); + timer.Stop(); + grid->Barrier(); + + bstimer.Start(); + if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + bstimer.Stop(); + + } else { + std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; + bstimer.Start(); + if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + 
if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + bstimer.Stop(); + + grid->Barrier(); + + timer.Start(); + ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); + ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); + timer.Stop(); + + } + ////////////////////////////////////////////////////////////////////////////// - // Munge [ .e.g 3rd row recon ] + // Finish up MPI I/O ////////////////////////////////////////////////////////////////////////////// - for(int x=0;xBarrier(); grid->GlobalSum(csum); grid->Barrier(); - vectorizeFromLexOrdArray(scalardata,Umu); - - timer.Stop(); - std::cout< - static inline uint32_t writeObjectSerial(Lattice &Umu,std::string file,munger munge,int offset, - const std::string & format) + ///////////////////////////////////////////////////////////////////////////// + // Read a Lattice of object + ////////////////////////////////////////////////////////////////////////////////////// + template + static inline uint32_t readLatticeObject(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) { - typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::Realified::scalar_type word; word w=0; GridBase *grid = Umu._grid; + int lsites = grid->lSites(); - int ieee32big = (format == std::string("IEEE32BIG")); - int ieee32 = (format == std::string("IEEE32")); - int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); - - ////////////////////////////////////////////////// - // Serialise through node zero - ////////////////////////////////////////////////// - std::cout<< GridLogMessage<< "Serial write I/O "<< file< scalardata(lsites); + std::vector iodata(lsites); // Munge, checksum, byte order in here - std::ofstream fout; - if ( grid->IsBoss() ) { - fout.open(file,std::ios::binary|std::ios::out|std::ios::in); - fout.seekp(offset); - } - uint64_t bytes=0; - uint32_t csum=0; - int lx = grid->_fdimensions[0]; - std::vector file_object(lx); - std::vector unmunged(lx); - for(int t=0;t_fdimensions[3];t++){ - for(int z=0;z_fdimensions[2];z++){ - for(int y=0;y_fdimensions[1];y++){ - { + int doread=1; + uint32_t csum= IOobject(w,grid,iodata,file,offset,format,doread); - std::vector site({0,y,z,t}); - // peek & write - for(int x=0;xIsBoss() ) { - for(int x=0;xBarrier(); timer.Stop(); - std::cout<Broadcast(0,(void *)&csum,sizeof(csum)); + std::cout< + static inline uint32_t writeLatticeObject(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::Realified::scalar_type word; word w=0; + GridBase *grid = Umu._grid; + int lsites = grid->lSites(); + + std::vector scalardata(lsites); + std::vector iodata(lsites); // Munge, checksum, byte order in here + + ////////////////////////////////////////////////////////////////////////////// + // Munge [ .e.g 3rd row recon ] + ////////////////////////////////////////////////////////////////////////////// + GridStopWatch timer; timer.Start(); + unvectorizeToLexOrdArray(scalardata,Umu); + + parallel_for(int x=0;xBarrier(); + timer.Stop(); + + int dowrite=0; + uint32_t csum= IOobject(w,grid,iodata,file,offset,format,dowrite); + + 
std::cout< RNGstate; + typedef RngStateType word; word w=0; - GridBase *grid = parallel._grid; - int gsites = grid->_gsites; - - GridStopWatch timer; timer.Start(); - ////////////////////////////////////////////////// - // Serialise through node zero - ////////////////////////////////////////////////// - std::ofstream fout; - if (grid->IsBoss()) { - fout.open(file, std::ios::binary | std::ios::out); - if (!fout.is_open()) { - std::cout << GridLogMessage << "writeRNGSerial: Error opening file " << file << std::endl; - exit(0);// write better error handling - } - fout.seekp(offset); - } - - std::cout << GridLogMessage << "Serial RNG write I/O on file " << file << std::endl; uint32_t csum = 0; - std::vector saved(RngStateCount); - int bytes = sizeof(RngStateType) * saved.size(); - std::cout << GridLogDebug << "RngStateCount: " << RngStateCount << std::endl; - std::cout << GridLogDebug << "Type has " << bytes << " bytes" << std::endl; - std::vector gcoor; - - for(int gidx=0;gidxGlobalIndexToGlobalCoor(gidx,gcoor); - grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); - int l_idx=parallel.generator_idx(o_idx,i_idx); - - if( rank == grid->ThisRank() ){ - parallel.GetState(saved,l_idx); - } - - if ( rank != 0 ) { - grid->Broadcast(rank, (void *)&saved[0], bytes); - } - - grid->Barrier(); - - if ( grid->IsBoss() ) { - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); - fout.write((char *)&saved[0],bytes);assert( fout.fail()==0); - } - - } - - if ( grid->IsBoss() ) { - serial.GetState(saved,0); - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); - fout.write((char *)&saved[0],bytes);assert( fout.fail()==0); - } - - grid->Broadcast(0, (void *)&csum, sizeof(csum)); - - if (grid->IsBoss()) { - fout.close(); - } - - timer.Stop(); - - std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; - std::cout << GridLogMessage << "RNG state saved in " << timer.Elapsed() << std::endl; - return csum; - } - - - static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG ¶llel,std::string file,int offset) - { - typedef typename GridSerialRNG::RngStateType RngStateType; - const int RngStateCount = GridSerialRNG::RngStateCount; + std::string format = "IEEE32BIG"; GridBase *grid = parallel._grid; - int gsites = grid->_gsites; + int gsites = grid->gSites(); + int lsites = grid->lSites(); - ////////////////////////////////////////////////// - // Serialise through node zero - ////////////////////////////////////////////////// - std::cout<< GridLogMessage<< "Serial RNG read I/O of file "<IsBoss()) { - fin.open(file, std::ios::binary | std::ios::in); - if (!fin.is_open()) { - std::cout << GridLogMessage << "readRNGSerial: Error opening file " << file << std::endl; - exit(0);// write better error handling - } - fin.seekg(offset); - } - - - uint32_t csum=0; - std::vector saved(RngStateCount); - int bytes = sizeof(RngStateType)*saved.size(); - std::cout << GridLogDebug << "RngStateCount: " << RngStateCount << std::endl; - std::cout << GridLogDebug << "Type has " << bytes << " bytes" << std::endl; - std::vector gcoor; - - std::cout << GridLogDebug << "gsites: " << gsites << " loop" << std::endl; - for(int gidx=0;gidxGlobalIndexToGlobalCoor(gidx,gcoor); - grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); - int l_idx=parallel.generator_idx(o_idx,i_idx); - - if ( grid->IsBoss() ) { - fin.read((char *)&saved[0],bytes);assert( fin.fail()==0); - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); - } - - grid->Broadcast(0,(void *)&saved[0],bytes); - grid->Barrier(); - - if( 
rank == grid->ThisRank() ){ - parallel.SetState(saved,l_idx); - } - } - - if ( grid->IsBoss() ) { - fin.read((char *)&saved[0],bytes);assert( fin.fail()==0); - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); - serial.SetState(saved,0); - } - - std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; - - grid->Broadcast(0,(void *)&csum,sizeof(csum)); - - return csum; - } - - template - static inline uint32_t readObjectParallel(Lattice &Umu, - std::string file, - munger munge, - int offset, - const std::string &format, - ILDGtype ILDG = ILDGtype()) { - typedef typename vobj::scalar_object sobj; - - GridBase *grid = Umu._grid; - - int ieee32big = (format == std::string("IEEE32BIG")); - int ieee32 = (format == std::string("IEEE32")); - int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); - - - // Ideally one reader/writer per xy plane and read these contiguously - // with comms from nominated I/O nodes. - std::ifstream fin; - - int nd = grid->_ndimension; - std::vector parallel(nd,1); parallel[0] = 0; - std::vector ioproc (nd); - std::vector start(nd); - std::vector range(nd); - - for(int d=0;dCheckerBoarded(d) == 0); - } - - uint64_t slice_vol = 1; - - int IOnode = 1; - int gstrip = grid->_gdimensions[0]; - int lstrip = grid->_ldimensions[0]; - - int chunk ; - if ( nd==1) chunk = gstrip; - else chunk = gstrip*grid->_ldimensions[1]; - - for(int d=0;d_ndimension;d++) { - - if (parallel[d]) { - range[d] = grid->_ldimensions[d]; - start[d] = grid->_processor_coor[d]*range[d]; - ioproc[d]= grid->_processor_coor[d]; - } else { - range[d] = grid->_gdimensions[d]; - start[d] = 0; - ioproc[d]= 0; - - if ( grid->_processor_coor[d] != 0 ) IOnode = 0; - } - slice_vol = slice_vol * range[d]; - } - - { - uint32_t tmp = IOnode; - grid->GlobalSum(tmp); - std::cout<< std::dec ; - std::cout<< GridLogMessage<< "Parallel read I/O from "<< file << " with " <_ndimension;d++){ - std::cout<< range[d]; - if( d< grid->_ndimension-1 ) - std::cout<< " x "; - } - std::cout << std::endl; - std::cout<< GridLogMessage<< "Parallel I/O local strip size is "<< lstrip <ThisRank(); - int iorank = grid->RankFromProcessorCoor(ioproc); - - if (!ILDG.is_ILDG) { - if ( IOnode ) { - fin.open(file,std::ios::binary|std::ios::in); - if ( !fin.is_open() ) { - std::cout << GridLogMessage << "readObjectParallel: Error opening file " << file << std::endl; - exit(0); - } - } - } - - ////////////////////////////////////////////////////////// - // Find the location of each site and send to primary node - // Take loop order from Chroma; defines loop order now that NERSC doc no longer - // available (how short sighted is that?) 
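The hand-rolled scatter being removed here is superseded by the IOobject path; composed from the routines defined earlier in this patch, the replacement read amounts to (a sketch):

    std::vector<fobj> iodata(grid->lSites());  // node-local lexico block
    uint32_t csum = IOobject(w,grid,iodata,file,offset,format,doread);
    parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
    vectorizeFromLexOrdArray(scalardata,Umu);  // repack into SIMD lattice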
- ////////////////////////////////////////////////////////// - Umu = zero; - static uint32_t csum; csum=0;//static for SHMEM - - std::vector fileObj(chunk); // FIXME - std::vector siteObj(chunk); // Use alignedAllocator to place in symmetric region for SHMEM - - // need to implement these loops in Nd independent way with a lexico conversion - for(int tlex=0;tlex tsite(nd); // temporary mixed up site - std::vector gsite(nd); - std::vector lsite(nd); - - int rank, o_idx,i_idx, g_idx; - - /////////////////////////////////////////// - // Get the global lexico base of the chunk - /////////////////////////////////////////// - Lexicographic::CoorFromIndex(tsite,tlex,range); - for(int d=0;dGlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); - grid->GlobalCoorToGlobalIndex(gsite,g_idx); - - //////////////////////////////// - // iorank reads from the seek - //////////////////////////////// - if (myrank == iorank) { - - if (ILDG.is_ILDG){ -#ifdef HAVE_LIME - // use C-LIME to populate the record - uint64_t sizeFO = sizeof(fobj); - uint64_t sizeChunk= sizeFO*chunk; - limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET); - int status = limeReaderReadData((void *)&fileObj[0], &sizeChunk, ILDG.LR); -#else - assert(0); -#endif - } else { - fin.seekg(offset+g_idx*sizeof(fobj)); - fin.read((char *)&fileObj[0],sizeof(fobj)*chunk); - } - bytes+=sizeof(fobj)*chunk; - - if(ieee32big) be32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); - if(ieee32) le32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); - if(ieee64big) be64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); - if(ieee64) le64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk); - - for(int c=0;c_ldimensions[d]; // local site - gsite[d] = tsite[d]+start[d]; // global site - } - grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); - - if ( rank != iorank ) { - if ( (myrank == rank) || (myrank==iorank) ) { - grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],iorank,rank,sizeof(sobj)*lstrip); - } - } - // Poke at destination - if ( myrank == rank ) { - for(int x=0;xBarrier(); // necessary? - } - } - - grid->GlobalSum(csum); - grid->GlobalSum(bytes); - grid->Barrier(); - - timer.Stop(); - std::cout< - static inline uint32_t writeObjectParallel(Lattice &Umu, - std::string file, munger munge, - int offset, - const std::string &format, - ILDGtype ILDG = ILDGtype()) { - typedef typename vobj::scalar_object sobj; - GridBase *grid = Umu._grid; - - int ieee32big = (format == std::string("IEEE32BIG")); - int ieee32 = (format == std::string("IEEE32")); - int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); - - if (!(ieee32big || ieee32 || ieee64big || ieee64)) { - std::cout << GridLogError << "Unrecognized file format " << format << std::endl; - std::cout << GridLogError << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64" << std::endl; - exit(0); - } - - int nd = grid->_ndimension; - for (int d = 0; d < nd; d++) { - assert(grid->CheckerBoarded(d) == 0); - } - - // Parallel in yzt, serial funnelled in "x". 
- // gx x ly chunk size - std::vector parallel(nd, 1); parallel[0] = 0; - std::vector ioproc(nd); - std::vector start(nd); - std::vector range(nd); - - uint64_t slice_vol = 1; - - int IOnode = 1; - int gstrip = grid->_gdimensions[0]; - int lstrip = grid->_ldimensions[0]; - int chunk; - if ( nd==1) chunk = gstrip; - else chunk = gstrip*grid->_ldimensions[1]; - - for (int d = 0; d < grid->_ndimension; d++) { - - if (parallel[d]) { - range[d] = grid->_ldimensions[d]; - start[d] = grid->_processor_coor[d]*range[d]; - ioproc[d]= grid->_processor_coor[d]; - } else { - range[d] = grid->_gdimensions[d]; - start[d] = 0; - ioproc[d]= 0; - - if ( grid->_processor_coor[d] != 0 ) IOnode = 0; - } - - slice_vol = slice_vol * range[d]; - } - - { - uint32_t tmp = IOnode; - grid->GlobalSum(tmp); - std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <_ndimension;d++){ - std::cout<< range[d]; - if( d< grid->_ndimension-1 ) - std::cout<< " x "; - } - std::cout << std::endl; - std::cout<< GridLogMessage<< "Parallel I/O local strip size is "<< lstrip < iodata(lsites); + csum= IOobject(w,grid,iodata,file,offset,format,doread); + timer.Start(); - uint64_t bytes=0; - - int myrank = grid->ThisRank(); - int iorank = grid->RankFromProcessorCoor(ioproc); - - // Take into account block size of parallel file systems want about - // Ideally one reader/writer per xy plane and read these contiguously - // with comms from nominated I/O nodes. - std::ofstream fout; - if (!ILDG.is_ILDG) { - if (IOnode){ - fout.open(file, std::ios::binary | std::ios::in | std::ios::out); - if (!fout.is_open()) { - std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file << std::endl; - exit(0); - } - } + parallel_for(int lidx=0;lidx tmp(RngStateCount); + std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); + parallel.SetState(tmp,lidx); } - - ////////////////////////////////////////////////////////// - // Find the location of each site and send to primary node - // Take loop order from Chroma; defines loop order now that NERSC doc no - // longer - // available (how short sighted is that?) - ////////////////////////////////////////////////////////// + timer.Stop(); + + std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; + std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; + return csum; + } + ///////////////////////////////////////////////////////////////////////////// + // Write a RNG; lexico map to an array of state and use IOobject + ////////////////////////////////////////////////////////////////////////////////////// + static inline uint32_t writeRNG(GridSerialRNG &serial,GridParallelRNG ¶llel,std::string file,int offset) + { + typedef typename GridSerialRNG::RngStateType RngStateType; + typedef RngStateType word; word w=0; + const int RngStateCount = GridSerialRNG::RngStateCount; + typedef std::array RNGstate; uint32_t csum = 0; - std::vector fileObj(chunk); - std::vector siteObj(chunk); - // should aggregate a whole chunk and then write. 
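The readRNG/writeRNG routines added in this hunk flatten each site's generator state into a fixed-size record, so the whole lattice becomes one contiguous buffer that IOobject can move in a single shot. A hedged sketch of the pack step (the word type and RngStateCount here are placeholders standing in for the engine's real state layout):

    #include <array>
    #include <vector>
    #include <algorithm>
    #include <cstdint>

    using word = uint32_t;                 // stand-in for RngStateType
    constexpr int RngStateCount = 4;       // stand-in for the engine's count
    using RNGstate = std::array<word, RngStateCount>;

    // Pack per-site engine state into fixed-size records for flat binary I/O;
    // the unpack direction is the same std::copy with source and target swapped.
    std::vector<RNGstate> pack(const std::vector<std::vector<word>> &site_state)
    {
      std::vector<RNGstate> iodata(site_state.size());
      for (size_t l = 0; l < site_state.size(); ++l)
        std::copy(site_state[l].begin(), site_state[l].end(), iodata[l].begin());
      return iodata;                       // one fixed-size record per site
    }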
- // need to implement these loops in Nd independent way with a lexico - // conversion - for (int tlex = 0; tlex < slice_vol; tlex+=chunk) { + GridBase *grid = parallel._grid; + int gsites = grid->gSites(); + int lsites = grid->lSites(); - std::vector tsite(nd); // temporary mixed up site - std::vector gsite(nd); - std::vector lsite(nd); + GridStopWatch timer; + std::string format = "IEEE32BIG"; - int rank, o_idx, i_idx, g_idx; + std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl; - // Possibly do transport through pt2pt - for(int cc=0;cc_ldimensions[d]; // local site - gsite[d] = tsite[d]+start[d]; // global site - } - grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite); - - // Owner of data peeks it over lstrip - if ( myrank == rank ) { - for(int x=0;xSendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],rank,iorank,sizeof(sobj)*lstrip); - } - } - } - - grid->Barrier(); // necessary? - - ///////////////////////// - // Get the global lexico base of the chunk - ///////////////////////// - Lexicographic::CoorFromIndex(tsite, tlex, range); - for(int d = 0;d < nd; d++){ gsite[d] = tsite[d] + start[d];} - grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite); - grid->GlobalCoorToGlobalIndex(gsite, g_idx); - - if (myrank == iorank) { - - for(int c=0;c iodata(lsites); + parallel_for(int lidx=0;lidx tmp(RngStateCount); + parallel.GetState(tmp,lidx); + std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); } - - grid->GlobalSum(csum); - grid->GlobalSum(bytes); - timer.Stop(); - std::cout << GridLogPerformance << "writeObjectParallel: wrote " << bytes - << " bytes in " << timer.Elapsed() << " " - << (double)bytes / timer.useconds() << " MB/s " << std::endl; - grid->Barrier(); // necessary? - if (!ILDG.is_ILDG) { - if (IOnode) { - fout.close(); - } - } + int dowrite=0; + csum= IOobject(w,grid,iodata,file,offset,format,dowrite); + std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; + std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; return csum; } }; } - #endif From 1a1f6d55f9ac7c94b7ddd1f129d26ddf87d29c9c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Jun 2017 17:37:26 -0400 Subject: [PATCH 039/170] Roll over to MPI IO for parallel IO --- lib/parallelIO/NerscIO.h | 175 ++++++++++++--------------------------- 1 file changed, 52 insertions(+), 123 deletions(-) diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index ab535dac..ba9d23de 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -30,12 +30,6 @@ #ifndef GRID_NERSC_IO_H #define GRID_NERSC_IO_H -#undef PARALLEL_READ -#undef SERIAL_READ -#define MPI_READ - -#define PARALLEL_WRITE - #include #include #include @@ -133,10 +127,6 @@ namespace Grid { ////////////////////////////////////////////////////////////////////// // Utilities ; these are QCD aware ////////////////////////////////////////////////////////////////////// - inline void NerscChecksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum) - { - BinaryIO::Uint32Checksum(buf,buf_size_bytes,csum); - } inline void reconstruct3(LorentzColourMatrix & cm) { const int x=0; @@ -151,43 +141,38 @@ namespace Grid { template struct NerscSimpleMunger{ - void operator()(fobj &in, sobj &out, uint32_t &csum) { + void operator()(fobj &in, sobj &out) { for (int mu = 0; mu < Nd; mu++) { for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - } - } + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = 
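/* Note: across this patch every munger loses its uint32_t &csum argument and
   the trailing NerscChecksum() call. Checksumming now happens once, centrally,
   in BinaryIO's I/O path, leaving the mungers as pure data-layout conversions
   between file and in-memory object formats. */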
in(mu)()(i, j); + }} } - NerscChecksum((uint32_t *)&in, sizeof(in), csum); }; }; template struct NerscSimpleUnmunger { - void operator()(sobj &in, fobj &out, uint32_t &csum) { + + void operator()(sobj &in, fobj &out) { for (int mu = 0; mu < Nd; mu++) { for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - } - } + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} } - NerscChecksum((uint32_t *)&out, sizeof(out), csum); }; }; template struct Nersc3x2munger{ - void operator() (fobj &in,sobj &out,uint32_t &csum){ - - NerscChecksum((uint32_t *)&in,sizeof(in),csum); + void operator() (fobj &in,sobj &out){ for(int mu=0;mu<4;mu++){ for(int i=0;i<2;i++){ - for(int j=0;j<3;j++){ - out(mu)()(i,j) = in(mu)(i)(j); - }} + for(int j=0;j<3;j++){ + out(mu)()(i,j) = in(mu)(i)(j); + }} } reconstruct3(out); } @@ -196,18 +181,13 @@ namespace Grid { template struct Nersc3x2unmunger{ - void operator() (sobj &in,fobj &out,uint32_t &csum){ - - + void operator() (sobj &in,fobj &out){ for(int mu=0;mu<4;mu++){ for(int i=0;i<2;i++){ - for(int j=0;j<3;j++){ - out(mu)(i)(j) = in(mu)()(i,j); - }} + for(int j=0;j<3;j++){ + out(mu)(i)(j) = in(mu)()(i,j); + }} } - - NerscChecksum((uint32_t *)&out,sizeof(out),csum); - } }; @@ -333,9 +313,9 @@ namespace Grid { // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template - static inline void readConfiguration(Lattice > &Umu,NerscField& header,std::string file) - { + template + static inline void readConfiguration(Lattice > &Umu,NerscField& header,std::string file) + { typedef Lattice > GaugeField; GridBase *grid = Umu._grid; @@ -354,62 +334,22 @@ namespace Grid { // depending on datatype, set up munger; // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { - if ( ieee32 || ieee32big ) { -#ifdef PARALLEL_READ - csum=BinaryIO::readObjectParallel, LorentzColour2x3F> - (Umu,file,Nersc3x2munger(), offset,format); -#endif -#ifdef MPI_READ - csum=BinaryIO::readObjectMPI, LorentzColour2x3F> - (Umu,file,Nersc3x2munger(), offset,format); -#endif -#ifdef SERIAL_READ - csum=BinaryIO::readObjectSerial, LorentzColour2x3F> - (Umu,file,Nersc3x2munger(), offset,format); -#endif - } - if ( ieee64 || ieee64big ) { -#ifdef PARALLEL_READ - csum=BinaryIO::readObjectParallel, LorentzColour2x3D> - (Umu,file,Nersc3x2munger(),offset,format); -#endif -#ifdef MPI_READ - csum=BinaryIO::readObjectMPI, LorentzColour2x3D> - (Umu,file,Nersc3x2munger(),offset,format); -#endif -#ifdef SERIAL_READ - csum=BinaryIO::readObjectSerial, LorentzColour2x3D> - (Umu,file,Nersc3x2munger(),offset,format); -#endif - } - } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { if ( ieee32 || ieee32big ) { -#ifdef PARALLEL_READ - csum=BinaryIO::readObjectParallel,LorentzColourMatrixF> - (Umu,file,NerscSimpleMunger(),offset,format); -#endif -#ifdef MPI_READ - csum=BinaryIO::readObjectMPI,LorentzColourMatrixF> - (Umu,file,NerscSimpleMunger(),offset,format); -#endif -#ifdef SERIAL_READ - csum=BinaryIO::readObjectSerial,LorentzColourMatrixF> - (Umu,file,NerscSimpleMunger(),offset,format); -#endif + csum=BinaryIO::readLatticeObject, LorentzColour2x3F> + (Umu,file,Nersc3x2munger(), offset,format); } if ( ieee64 || ieee64big ) { -#ifdef PARALLEL_READ - csum=BinaryIO::readObjectParallel,LorentzColourMatrixD> + csum=BinaryIO::readLatticeObject, LorentzColour2x3D> + (Umu,file,Nersc3x2munger(),offset,format); + } + } else if ( 
header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { + if ( ieee32 || ieee32big ) { + csum=BinaryIO::readLatticeObject,LorentzColourMatrixF> + (Umu,file,NerscSimpleMunger(),offset,format); + } + if ( ieee64 || ieee64big ) { + csum=BinaryIO::readLatticeObject,LorentzColourMatrixD> (Umu,file,NerscSimpleMunger(),offset,format); -#endif -#ifdef MPI_READ - csum=BinaryIO::readObjectMPI,LorentzColourMatrixD> - (Umu,file,NerscSimpleMunger(),offset,format); -#endif -#ifdef SERIAL_READ - csum=BinaryIO::readObjectSerial,LorentzColourMatrixD> - (Umu,file,NerscSimpleMunger(),offset,format); -#endif } } else { assert(0); @@ -434,14 +374,14 @@ namespace Grid { std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl; std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl; std::cerr << " csum " < static inline void writeConfiguration(Lattice > &Umu,std::string file, int two_row,int bits32) @@ -466,41 +406,29 @@ namespace Grid { NerscStatistics(Umu,header); NerscMachineCharacteristics(header); - uint32_t csum; int offset; truncate(file); if ( two_row ) { - header.floating_point = std::string("IEEE64BIG"); header.data_type = std::string("4D_SU3_GAUGE"); Nersc3x2unmunger munge; - BinaryIO::Uint32Checksum(Umu, munge,header.checksum); offset = writeHeader(header,file); -#ifdef PARALLEL_WRITE - csum=BinaryIO::writeObjectParallel(Umu,file,munge,offset,header.floating_point); -#else - csum=BinaryIO::writeObjectSerial(Umu,file,munge,offset,header.floating_point); -#endif + header.checksum=BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point); + writeHeader(header,file); } else { header.floating_point = std::string("IEEE64BIG"); header.data_type = std::string("4D_SU3_GAUGE_3x3"); NerscSimpleUnmunger munge; - BinaryIO::Uint32Checksum(Umu, munge,header.checksum); offset = writeHeader(header,file); -#ifdef PARALLEL_WRITE - csum=BinaryIO::writeObjectParallel(Umu,file,munge,offset,header.floating_point); -#else - csum=BinaryIO::writeObjectSerial(Umu,file,munge,offset,header.floating_point); -#endif + header.checksum=BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point); + writeHeader(header,file); } - - std::cout< - uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset); + uint32_t csum=BinaryIO::readRNG(serial,parallel,file,offset); if ( csum != header.checksum ) { std::cerr << "checksum mismatch "< Date: Thu, 1 Jun 2017 17:38:18 -0400 Subject: [PATCH 040/170] Roll over to MPI version of I/O --- tests/IO/Test_nersc_io.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index 8507df13..0a0f8977 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -38,10 +38,13 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); + std::cout < simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - std::vector latt_size ({16,16,16,16}); + std::vector latt_size ({48,48,48,96}); + //std::vector latt_size ({32,32,32,32}); + //std::vector latt_size ({16,16,16,32}); std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; @@ -49,14 +52,17 @@ int main (int argc, char ** argv) GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridCartesian Coarse(clatt_size,simd_layout,mpi_layout); + GridParallelRNG pRNGa(&Fine); GridParallelRNG pRNGb(&Fine); GridSerialRNG sRNGa; GridSerialRNG sRNGb; + std::cout <({45,12,81,9})); 
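Seeding only the "a" generators here is deliberate: the test writes state a to disk, reads it back into the untouched b copies, and the two must then agree. A condensed sketch of that round trip, using the readRNG/writeRNG entry points introduced two patches further down (the file name and the leading truncate are illustrative):

    std::string rngfile("ckpoint_rng.0");              // illustrative name
    truncate(rngfile);                                 // start from an empty file
    uint32_t csum_w = BinaryIO::writeRNG(sRNGa, pRNGa, rngfile, 0);
    uint32_t csum_r = BinaryIO::readRNG (sRNGb, pRNGb, rngfile, 0);
    assert(csum_w == csum_r);  // identical bytes should give identical checksums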
sRNGa.SeedFixedIntegers(std::vector({45,12,81,9})); - + std::cout < Plaq_T(orthosz); sliceSum(Plaq,Plaq_T,Nd-1); From 094c3d091afb3f29e7e370562cb0def29b3b26f0 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 2 Jun 2017 00:38:58 +0100 Subject: [PATCH 041/170] Improved and RNG's now survive checkpoint --- lib/parallelIO/BinaryIO.h | 175 ++++++++++++------ .../hmc/checkpointers/BinaryCheckpointer.h | 12 +- tests/IO/Test_nersc_io.cc | 4 +- 3 files changed, 124 insertions(+), 67 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 13341927..e427a25b 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -133,7 +133,6 @@ class BinaryIO { } #pragma omp critical csum = csum + csum_thr; - } } // Network is big endian @@ -227,13 +226,20 @@ class BinaryIO { // Real action: // Read or Write distributed lexico array of ANY object to a specific location in file ////////////////////////////////////////////////////////////////////////////////////// + + static const int BINARYIO_MASTER_APPEND = 0x10; + static const int BINARYIO_UNORDERED = 0x08; + static const int BINARYIO_LEXICOGRAPHIC = 0x04; + static const int BINARYIO_READ = 0x02; + static const int BINARYIO_WRITE = 0x01; + template - static inline uint32_t IOobject(word w, - GridBase *grid, - std::vector &iodata, - std::string file, - int offset, - const std::string &format, int doread) + static inline uint32_t IOobject(word w, + GridBase *grid, + std::vector &iodata, + std::string file, + int offset, + const std::string &format, int control) { grid->Barrier(); GridStopWatch timer; @@ -250,21 +256,24 @@ class BinaryIO { std::vector gLattice= grid->GlobalDimensions(); std::vector lLattice= grid->LocalDimensions(); - std::vector distribs(ndim,MPI_DISTRIBUTE_BLOCK); - std::vector dargs (ndim,MPI_DISTRIBUTE_DFLT_DARG); - std::vector lStart(ndim); std::vector gStart(ndim); // Flatten the file uint64_t lsites = grid->lSites(); - iodata.resize(lsites); - + if ( control & BINARYIO_MASTER_APPEND ) { + assert(iodata.size()==1); + } else { + assert(lsites==iodata.size()); + } for(int d=0;d distribs(ndim,MPI_DISTRIBUTE_BLOCK); + std::vector dargs (ndim,MPI_DISTRIBUTE_DFLT_DARG); MPI_Datatype mpiObject; MPI_Datatype fileArray; MPI_Datatype localArray; @@ -281,7 +290,6 @@ class BinaryIO { numword = sizeof(fobj)/sizeof(double); mpiword = MPI_DOUBLE; } - ////////////////////////////////////////////////////////////////////////////// // Sobj in MPI phrasing @@ -301,6 +309,7 @@ class BinaryIO { ////////////////////////////////////////////////////////////////////////////// ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0); ierr=MPI_Type_commit(&localArray); assert(ierr==0); +#endif ////////////////////////////////////////////////////////////////////////////// // Byte order @@ -311,55 +320,91 @@ class BinaryIO { int ieee64 = (format == std::string("IEEE64")); ////////////////////////////////////////////////////////////////////////////// - // Do the MPI I/O read + // Do the I/O ////////////////////////////////////////////////////////////////////////////// - if ( doread ) { - std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; + if ( control & BINARYIO_READ ) { + timer.Start(); - ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); - ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); - ierr=MPI_File_read_all(fh, &iodata[0], 1, 
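/* Note: this collective read leans on the file view installed just above.
   MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", ...) maps each
   rank's subarray (built with MPI_Type_create_subarray from lLattice/lStart
   inside gLattice) onto its slot of the global lexicographic array, so every
   rank pulls in exactly its own sites in one call. */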
localArray, &status); assert(ierr==0); + + if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { +#ifdef USE_MPI_IO + std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; + ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); + ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); + MPI_File_close(&fh); + MPI_Type_free(&fileArray); + MPI_Type_free(&localArray); +#else + assert(0); +#endif + } else { + std::cout<< GridLogMessage<< "C++ read I/O "<< file<< std::endl; + std::ifstream fin; + fin.open(file,std::ios::binary|std::ios::in); + if ( control & BINARYIO_MASTER_APPEND ) { + fin.seekg(-sizeof(fobj),fin.end); + } else { + fin.seekg(offset+myrank*lsites*sizeof(fobj)); + } + fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0); + fin.close(); + } timer.Stop(); grid->Barrier(); bstimer.Start(); - if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); + if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); bstimer.Stop(); - - } else { - std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; - bstimer.Start(); - if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum); - bstimer.Stop(); - - grid->Barrier(); - - timer.Start(); - ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); - ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); - ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); - timer.Stop(); - } - - ////////////////////////////////////////////////////////////////////////////// - // Finish up MPI I/O - ////////////////////////////////////////////////////////////////////////////// - MPI_File_close(&fh); - MPI_Type_free(&fileArray); - MPI_Type_free(&localArray); + + if ( control & BINARYIO_WRITE ) { + + bstimer.Start(); + if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + bstimer.Stop(); + + grid->Barrier(); + + timer.Start(); + if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { +#ifdef USE_MPI_IO + std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; + ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); + ierr=MPI_File_write_all(fh, &iodata[0], 1, 
localArray, &status); assert(ierr==0); + MPI_File_close(&fh); + MPI_Type_free(&fileArray); + MPI_Type_free(&localArray); +#else + assert(0); +#endif + } else { + std::cout<< GridLogMessage<< "C++ write I/O "<< file<< std::endl; + std::ofstream fout; + fout.open(file,std::ios::binary|std::ios::out|std::ios::in); + if ( control & BINARYIO_MASTER_APPEND ) { + fout.seekp(0,fout.end); + } else { + fout.seekp(offset+myrank*lsites*sizeof(fobj)); + } + fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0); + fout.close(); + } + timer.Stop(); + } std::cout< scalardata(lsites); std::vector iodata(lsites); // Munge, checksum, byte order in here - int doread=1; - uint32_t csum= IOobject(w,grid,iodata,file,offset,format,doread); + uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC); GridStopWatch timer; timer.Start(); @@ -432,8 +476,7 @@ class BinaryIO { grid->Barrier(); timer.Stop(); - int dowrite=0; - uint32_t csum= IOobject(w,grid,iodata,file,offset,format,dowrite); + uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC); std::cout< iodata(lsites); - csum= IOobject(w,grid,iodata,file,offset,format,doread); + csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC); timer.Start(); parallel_for(int lidx=0;lidx tmp(RngStateCount); + std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin()); + serial.SetState(tmp,0); + } + std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; return csum; @@ -507,9 +557,16 @@ class BinaryIO { } timer.Stop(); - int dowrite=0; - csum= IOobject(w,grid,iodata,file,offset,format,dowrite); + csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC); + iodata.resize(1); + { + std::vector tmp(RngStateCount); + serial.GetState(tmp,0); + std::copy(tmp.begin(),tmp.end(),iodata[0].begin()); + } + csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND); + std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; return csum; diff --git a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h index 251ed042..6116a46c 100644 --- a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h +++ b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h @@ -68,11 +68,11 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer { std::string config, rng; this->build_filenames(traj, Params, config, rng); - BinaryIO::BinarySimpleUnmunger munge; + BinarySimpleUnmunger munge; truncate(rng); - BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0); + BinaryIO::writeRNG(sRNG, pRNG, rng, 0); truncate(config); - uint32_t csum = BinaryIO::writeObjectParallel( + uint32_t csum = BinaryIO::writeLatticeObject( U, config, munge, 0, Params.format); std::cout << GridLogMessage << "Written Binary Configuration " << config @@ -85,9 +85,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer { std::string config, rng; this->build_filenames(traj, Params, config, rng); - BinaryIO::BinarySimpleMunger munge; - BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0); - uint32_t csum = BinaryIO::readObjectParallel( + BinarySimpleMunger munge; + BinaryIO::readRNG(sRNG, pRNG, rng, 0); + uint32_t csum = BinaryIO::readLatticeObject( U, config, munge, 0, 
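/* Note: the checkpointer now funnels through the two unified entry points,
   readRNG for the generator state and readLatticeObject for the gauge field;
   the old readRNGSerial/readObjectParallel pair, and the BinaryIO:: nesting
   of the simple mungers, are gone. */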
Params.format); std::cout << GridLogMessage << "Read Binary Configuration " << config diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index 0a0f8977..14c6080d 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -42,9 +42,9 @@ int main (int argc, char ** argv) std::vector simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - std::vector latt_size ({48,48,48,96}); + //std::vector latt_size ({48,48,48,96}); //std::vector latt_size ({32,32,32,32}); - //std::vector latt_size ({16,16,16,32}); + std::vector latt_size ({16,16,16,32}); std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; From 092dcd4e04c1e069fe63984cfc7d9f1a0da9e703 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 2 Jun 2017 22:50:25 +0100 Subject: [PATCH 042/170] MPI I/O only if MPI compiled --- lib/parallelIO/BinaryIO.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index e427a25b..8b8d4165 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -29,12 +29,16 @@ #ifndef GRID_BINARY_IO_H #define GRID_BINARY_IO_H - -#include "IldgIOtypes.h" +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#define USE_MPI_IO +#else +#undef USE_MPI_IO +#endif #ifdef HAVE_ENDIAN_H #include #endif + #include #include From 009f48a9045c87c13ec2fde2b3630446bd65bbb6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 7 Jun 2017 16:34:09 +0100 Subject: [PATCH 043/170] QedFVol: Add missing factor of 2 in free vacuum polarisation --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index f6f40700..13591d83 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -185,6 +185,63 @@ void TScalarVP::execute(void) ScalarField &propSun = *env().createLattice(propSunName_); ScalarField &propTad = *env().createLattice(propTadName_); chargedProp(propQ, propSun, propTad, *GFSrc_, fft); + // // OUTPUT IF NECESSARY + // if (!par().output.empty()) + // { + // ScalarField fullProp = (*prop0_) + q*propQ + q*q*propSun + q*q*propTad; + // std::string filename = par().output + "_prop_000." + + // std::to_string(env().getTrajectory()); + + // LOG(Message) << "Saving zero-momentum projection to '" + // << filename << "'..." 
<< std::endl; + + // CorrWriter writer(filename); + // std::vector vecBuf; + // std::vector result; + + // write(writer, "charge", q); + + // // Write full propagator + // sliceSum(fullProp, vecBuf, Tp); + // result.resize(vecBuf.size()); + // for (unsigned int t = 0; t < vecBuf.size(); ++t) + // { + // result[t] = TensorRemove(vecBuf[t]); + // } + // write(writer, "prop", result); + + // // Write free propagator + // sliceSum(*prop0_, vecBuf, Tp); + // for (unsigned int t = 0; t < vecBuf.size(); ++t) + // { + // result[t] = TensorRemove(vecBuf[t]); + // } + // write(writer, "prop_0", result); + + // // Write propagator D1 term + // sliceSum(propD1, vecBuf, Tp); + // for (unsigned int t = 0; t < vecBuf.size(); ++t) + // { + // result[t] = TensorRemove(vecBuf[t]); + // } + // write(writer, "prop_q", result); + + // // Write propagator D1D1 term + // sliceSum(propD1D1, vecBuf, Tp); + // for (unsigned int t = 0; t < vecBuf.size(); ++t) + // { + // result[t] = TensorRemove(vecBuf[t]); + // } + // write(writer, "prop_sun", result); + + // // Write propagator D2 term + // sliceSum(propD2, vecBuf, Tp); + // for (unsigned int t = 0; t < vecBuf.size(); ++t) + // { + // result[t] = TensorRemove(vecBuf[t]); + // } + // write(writer, "prop_tad", result); + // } // Propagators from shifted sources LOG(Message) << "Computing O(q) charged scalar propagators..." @@ -281,6 +338,7 @@ void TScalarVP::execute(void) * (-0.5)*q*q*Amu*Amu * prop1; + freeVpTensor[mu][nu] = 2.0*real(freeVpTensor[mu][nu]); vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); } } From e38612e6fa71c57bf64fca975580a036be021cbb Mon Sep 17 00:00:00 2001 From: James Harrison Date: Wed, 7 Jun 2017 17:42:00 +0100 Subject: [PATCH 044/170] QedFVol: Update ScalarVP module for compatibility with new scalar action --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 2 +- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 13591d83..c91b98ae 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -125,7 +125,7 @@ void TScalarVP::execute(void) LOG(Message) << "Caching momentum space free scalar propagator" << " (mass= " << par().mass << ")..." 
<< std::endl; freeMomProp_ = env().createLattice(freeMomPropName_); - Scalar::MomentumSpacePropagator(*freeMomProp_, par().mass); + SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass); } else { diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index 4629f6e6..fdbad6f6 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -1,5 +1,5 @@ -#ifndef Hadrons_ScalarVP_hpp_ -#define Hadrons_ScalarVP_hpp_ +#ifndef Hadrons_MScalar_ScalarVP_hpp_ +#define Hadrons_MScalar_ScalarVP_hpp_ #include #include @@ -66,4 +66,4 @@ END_MODULE_NAMESPACE END_HADRONS_NAMESPACE -#endif // Hadrons_ScalarVP_hpp_ +#endif // Hadrons_MScalar_ScalarVP_hpp_ From 5f55bca378f0e379b8595a82d096e79e8a7ed92d Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 7 Jun 2017 20:10:48 -0500 Subject: [PATCH 045/170] Hadrons: Quark module renamed MFermion::GaugeProp --- extras/Hadrons/Modules.hpp | 2 +- .../{Quark.hpp => MFermion/GaugeProp.hpp} | 79 +++++++------------ extras/Hadrons/modules.inc | 4 +- tests/hadrons/Test_hadrons_meson_3pt.cc | 14 ++-- tests/hadrons/Test_hadrons_spectrum.cc | 6 +- 5 files changed, 42 insertions(+), 63 deletions(-) rename extras/Hadrons/Modules/{Quark.hpp => MFermion/GaugeProp.hpp} (65%) diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp index 42a1f651..c27254aa 100644 --- a/extras/Hadrons/Modules.hpp +++ b/extras/Hadrons/Modules.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -22,4 +23,3 @@ #include #include #include -#include diff --git a/extras/Hadrons/Modules/Quark.hpp b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp similarity index 65% rename from extras/Hadrons/Modules/Quark.hpp rename to extras/Hadrons/Modules/MFermion/GaugeProp.hpp index cf7d4c28..b4f9edcc 100644 --- a/extras/Hadrons/Modules/Quark.hpp +++ b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp @@ -1,34 +1,5 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: extras/Hadrons/Modules/Quark.hpp - -Copyright (C) 2015 -Copyright (C) 2016 - -Author: Antonin Portelli - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
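Patch 045 is almost entirely mechanical: the unnamespaced Quark module becomes MFermion::GaugeProp, its registration switches from MODULE_REGISTER to MODULE_REGISTER_NS, and every client updates its createModule call. Condensed from the test diffs further down (template parameter written out explicitly for clarity):

    // before: application.createModule<Quark>("Qpt_l", quarkPar);
    MFermion::GaugeProp::Par quarkPar;
    quarkPar.solver = "CG_l";
    quarkPar.source = "pt";
    application.createModule<MFermion::GaugeProp>("Qpt_l", quarkPar);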
- -See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ - -#ifndef Hadrons_Quark_hpp_ -#define Hadrons_Quark_hpp_ +#ifndef Hadrons_MFermion_GaugeProp_hpp_ +#define Hadrons_MFermion_GaugeProp_hpp_ #include #include @@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo BEGIN_HADRONS_NAMESPACE /****************************************************************************** - * TQuark * + * GaugeProp * ******************************************************************************/ -class QuarkPar: Serializable +BEGIN_MODULE_NAMESPACE(MFermion) + +class GaugePropPar: Serializable { public: - GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar, + GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar, std::string, source, std::string, solver); }; template -class TQuark: public Module +class TGaugeProp: public Module { public: FGS_TYPE_ALIASES(FImpl,); public: // constructor - TQuark(const std::string name); + TGaugeProp(const std::string name); // destructor - virtual ~TQuark(void) = default; - // dependencies/products + virtual ~TGaugeProp(void) = default; + // dependency relation virtual std::vector getInput(void); virtual std::vector getOutput(void); // setup @@ -69,20 +42,20 @@ private: SolverFn *solver_{nullptr}; }; -MODULE_REGISTER(Quark, TQuark); +MODULE_REGISTER_NS(GaugeProp, TGaugeProp, MFermion); /****************************************************************************** - * TQuark implementation * + * TGaugeProp implementation * ******************************************************************************/ // constructor ///////////////////////////////////////////////////////////////// template -TQuark::TQuark(const std::string name) -: Module(name) +TGaugeProp::TGaugeProp(const std::string name) +: Module(name) {} // dependencies/products /////////////////////////////////////////////////////// template -std::vector TQuark::getInput(void) +std::vector TGaugeProp::getInput(void) { std::vector in = {par().source, par().solver}; @@ -90,7 +63,7 @@ std::vector TQuark::getInput(void) } template -std::vector TQuark::getOutput(void) +std::vector TGaugeProp::getOutput(void) { std::vector out = {getName(), getName() + "_5d"}; @@ -99,7 +72,7 @@ std::vector TQuark::getOutput(void) // setup /////////////////////////////////////////////////////////////////////// template -void TQuark::setup(void) +void TGaugeProp::setup(void) { Ls_ = env().getObjectLs(par().solver); env().template registerLattice(getName()); @@ -111,13 +84,13 @@ void TQuark::setup(void) // execution /////////////////////////////////////////////////////////////////// template -void TQuark::execute(void) +void TGaugeProp::execute(void) { LOG(Message) << "Computing quark propagator '" << getName() << "'" - << std::endl; + << std::endl; FermionField source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)), - tmp(env().getGrid()); + tmp(env().getGrid()); std::string propName = (Ls_ == 1) ? 
getName() : (getName() + "_5d"); PropagatorField &prop = *env().template createLattice(propName); PropagatorField &fullSrc = *env().template getObject(par().source); @@ -128,12 +101,12 @@ void TQuark::execute(void) } LOG(Message) << "Inverting using solver '" << par().solver - << "' on source '" << par().source << "'" << std::endl; + << "' on source '" << par().source << "'" << std::endl; for (unsigned int s = 0; s < Ns; ++s) for (unsigned int c = 0; c < Nc; ++c) { LOG(Message) << "Inversion for spin= " << s << ", color= " << c - << std::endl; + << std::endl; // source conversion for 4D sources if (!env().isObject5d(par().source)) { @@ -170,7 +143,7 @@ void TQuark::execute(void) if (Ls_ > 1) { PropagatorField &p4d = - *env().template getObject(getName()); + *env().template getObject(getName()); axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0); axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1); @@ -180,6 +153,8 @@ void TQuark::execute(void) } } +END_MODULE_NAMESPACE + END_HADRONS_NAMESPACE -#endif // Hadrons_Quark_hpp_ +#endif // Hadrons_MFermion_GaugeProp_hpp_ diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc index f51ede5a..669b08ba 100644 --- a/extras/Hadrons/modules.inc +++ b/extras/Hadrons/modules.inc @@ -20,6 +20,7 @@ modules_hpp =\ Modules/MContraction/WeakHamiltonianEye.hpp \ Modules/MContraction/WeakHamiltonianNonEye.hpp \ Modules/MContraction/WeakNeutral4ptDisc.hpp \ + Modules/MFermion/GaugeProp.hpp \ Modules/MGauge/Load.hpp \ Modules/MGauge/Random.hpp \ Modules/MGauge/StochEm.hpp \ @@ -33,6 +34,5 @@ modules_hpp =\ Modules/MSource/Point.hpp \ Modules/MSource/SeqGamma.hpp \ Modules/MSource/Wall.hpp \ - Modules/MSource/Z2.hpp \ - Modules/Quark.hpp + Modules/MSource/Z2.hpp diff --git a/tests/hadrons/Test_hadrons_meson_3pt.cc b/tests/hadrons/Test_hadrons_meson_3pt.cc index 7e487153..382c39d4 100644 --- a/tests/hadrons/Test_hadrons_meson_3pt.cc +++ b/tests/hadrons/Test_hadrons_meson_3pt.cc @@ -65,6 +65,10 @@ int main(int argc, char *argv[]) // set fermion boundary conditions to be periodic space, antiperiodic time. std::string boundary = "1 1 1 -1"; + // sink + MSink::Point::Par sinkPar; + sinkPar.mom = "0 0 0"; + application.createModule("sink", sinkPar); for (unsigned int i = 0; i < flavour.size(); ++i) { // actions @@ -115,15 +119,15 @@ int main(int argc, char *argv[]) } // propagators - Quark::Par quarkPar; + MFermion::GaugeProp::Par quarkPar; quarkPar.solver = "CG_" + flavour[i]; quarkPar.source = srcName; - application.createModule(qName[i], quarkPar); + application.createModule(qName[i], quarkPar); for (unsigned int mu = 0; mu < Nd; ++mu) { quarkPar.source = seqName[i][mu]; seqName[i][mu] = "Q_" + flavour[i] + "-" + seqName[i][mu]; - application.createModule(seqName[i][mu], quarkPar); + application.createModule(seqName[i][mu], quarkPar); } } @@ -136,7 +140,7 @@ int main(int argc, char *argv[]) mesPar.q1 = qName[i]; mesPar.q2 = qName[j]; mesPar.gammas = "all"; - mesPar.mom = "0. 0. 0. 0."; + mesPar.sink = "sink"; application.createModule("meson_Z2_" + std::to_string(t) + "_" @@ -155,7 +159,7 @@ int main(int argc, char *argv[]) mesPar.q1 = qName[i]; mesPar.q2 = seqName[j][mu]; mesPar.gammas = "all"; - mesPar.mom = "0. 0. 0. 
0."; + mesPar.sink = "sink"; application.createModule("3pt_Z2_" + std::to_string(t) + "_" diff --git a/tests/hadrons/Test_hadrons_spectrum.cc b/tests/hadrons/Test_hadrons_spectrum.cc index 8f7b30c8..801674f7 100644 --- a/tests/hadrons/Test_hadrons_spectrum.cc +++ b/tests/hadrons/Test_hadrons_spectrum.cc @@ -90,12 +90,12 @@ int main(int argc, char *argv[]) solverPar); // propagators - Quark::Par quarkPar; + MFermion::GaugeProp::Par quarkPar; quarkPar.solver = "CG_" + flavour[i]; quarkPar.source = "pt"; - application.createModule("Qpt_" + flavour[i], quarkPar); + application.createModule("Qpt_" + flavour[i], quarkPar); quarkPar.source = "z2"; - application.createModule("QZ2_" + flavour[i], quarkPar); + application.createModule("QZ2_" + flavour[i], quarkPar); } for (unsigned int i = 0; i < flavour.size(); ++i) for (unsigned int j = i; j < flavour.size(); ++j) From 24908162970faae02a878ce3298d3ebc79a47fb9 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 7 Jun 2017 20:11:02 -0500 Subject: [PATCH 046/170] Hadrons: rare kaon program removed --- tests/hadrons/Test_hadrons.hpp | 368 ------------------------- tests/hadrons/Test_hadrons_rarekaon.cc | 342 ----------------------- 2 files changed, 710 deletions(-) delete mode 100644 tests/hadrons/Test_hadrons.hpp delete mode 100644 tests/hadrons/Test_hadrons_rarekaon.cc diff --git a/tests/hadrons/Test_hadrons.hpp b/tests/hadrons/Test_hadrons.hpp deleted file mode 100644 index 26d02a5c..00000000 --- a/tests/hadrons/Test_hadrons.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/******************************************************************************* - Grid physics library, www.github.com/paboyle/Grid - - Source file: tests/hadrons/Test_hadrons.hpp - - Copyright (C) 2017 - - Author: Andrew Lawson - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution - directory. - *******************************************************************************/ - -#include - -using namespace Grid; -using namespace Hadrons; - -/******************************************************************************* - * Macros to reduce code duplication. - ******************************************************************************/ -// Useful definitions -#define ZERO_MOM "0. 0. 0. 0." 
-#define INIT_INDEX(s, n) (std::string(s) + "_" + std::to_string(n)) -#define ADD_INDEX(s, n) (s + "_" + std::to_string(n)) -#define LABEL_3PT(s, t1, t2) ADD_INDEX(INIT_INDEX(s, t1), t2) -#define LABEL_4PT(s, t1, t2, t3) ADD_INDEX(ADD_INDEX(INIT_INDEX(s, t1), t2), t3) -#define LABEL_4PT_NOISE(s, t1, t2, t3, nn) ADD_INDEX(ADD_INDEX(ADD_INDEX(INIT_INDEX(s, t1), t2), t3), nn) - -// Wall source/sink macros -#define NAME_3MOM_WALL_SOURCE(t, mom) ("wall_" + std::to_string(t) + "_" + mom) -#define NAME_WALL_SOURCE(t) NAME_3MOM_WALL_SOURCE(t, ZERO_MOM) -#define NAME_POINT_SOURCE(pos) ("point_" + pos) - -#define MAKE_3MOM_WALL_PROP(tW, mom, propName, solver)\ -{\ - std::string srcName = NAME_3MOM_WALL_SOURCE(tW, mom);\ - makeWallSource(application, srcName, tW, mom);\ - makePropagator(application, propName, srcName, solver);\ -} - -#define MAKE_WALL_PROP(tW, propName, solver)\ - MAKE_3MOM_WALL_PROP(tW, ZERO_MOM, propName, solver) - -// Sequential source macros -#define MAKE_SEQUENTIAL_PROP(tS, qSrc, mom, propName, solver)\ -{\ - std::string srcName = ADD_INDEX(qSrc + "_seq", tS);\ - makeSequentialSource(application, srcName, qSrc, tS, mom);\ - makePropagator(application, propName, srcName, solver);\ -} - -// Point source macros -#define MAKE_POINT_PROP(pos, propName, solver)\ -{\ - std::string srcName = NAME_POINT_SOURCE(pos);\ - makePointSource(application, srcName, pos);\ - makePropagator(application, propName, srcName, solver);\ -} - -/******************************************************************************* - * Functions for propagator construction. - ******************************************************************************/ - -/******************************************************************************* - * Name: makePointSource - * Purpose: Construct point source and add to application module. - * Parameters: application - main application that stores modules. - * srcName - name of source module to create. - * pos - Position of point source. - * Returns: None. - ******************************************************************************/ -inline void makePointSource(Application &application, std::string srcName, - std::string pos) -{ - // If the source already exists, don't make the module again. - if (!(Environment::getInstance().hasModule(srcName))) - { - MSource::Point::Par pointPar; - pointPar.position = pos; - application.createModule(srcName, pointPar); - } -} - -/******************************************************************************* - * Name: makeSequentialSource - * Purpose: Construct sequential source and add to application module. - * Parameters: application - main application that stores modules. - * srcName - name of source module to create. - * qSrc - Input quark for sequential inversion. - * tS - sequential source timeslice. - * mom - momentum insertion (default is zero). - * Returns: None. - ******************************************************************************/ -inline void makeSequentialSource(Application &application, std::string srcName, - std::string qSrc, unsigned int tS, - std::string mom = ZERO_MOM) -{ - // If the source already exists, don't make the module again. 
- if (!(Environment::getInstance().hasModule(srcName))) - { - MSource::SeqGamma::Par seqPar; - seqPar.q = qSrc; - seqPar.tA = tS; - seqPar.tB = tS; - seqPar.mom = mom; - seqPar.gamma = Gamma::Algebra::GammaT; - application.createModule(srcName, seqPar); - } -} - -/******************************************************************************* - * Name: makeWallSource - * Purpose: Construct wall source and add to application module. - * Parameters: application - main application that stores modules. - * srcName - name of source module to create. - * tW - wall source timeslice. - * mom - momentum insertion (default is zero). - * Returns: None. - ******************************************************************************/ -inline void makeWallSource(Application &application, std::string srcName, - unsigned int tW, std::string mom = ZERO_MOM) -{ - // If the source already exists, don't make the module again. - if (!(Environment::getInstance().hasModule(srcName))) - { - MSource::Wall::Par wallPar; - wallPar.tW = tW; - wallPar.mom = mom; - application.createModule(srcName, wallPar); - } -} - -/******************************************************************************* - * Name: makeWallSink - * Purpose: Wall sink smearing of a propagator. - * Parameters: application - main application that stores modules. - * propName - name of input propagator. - * wallName - name of smeared propagator. - * mom - momentum insertion (default is zero). - * Returns: None. - ******************************************************************************/ -inline void makeWallSink(Application &application, std::string propName, - std::string wallName, std::string mom = ZERO_MOM) -{ - // If the propagator has already been smeared, don't smear it again. - // Temporarily removed, strategy for sink smearing likely to change. - /*if (!(Environment::getInstance().hasModule(wallName))) - { - MSink::Wall::Par wallPar; - wallPar.q = propName; - wallPar.mom = mom; - application.createModule(wallName, wallPar); - }*/ -} - -/******************************************************************************* - * Name: makePropagator - * Purpose: Construct source and propagator then add to application module. - * Parameters: application - main application that stores modules. - * propName - name of propagator module to create. - * srcName - name of source module to use. - * solver - solver to use (default is CG). - * Returns: None. - ******************************************************************************/ -inline void makePropagator(Application &application, std::string &propName, - std::string &srcName, std::string &solver) -{ - // If the propagator already exists, don't make the module again. - if (!(Environment::getInstance().hasModule(propName))) - { - Quark::Par quarkPar; - quarkPar.source = srcName; - quarkPar.solver = solver; - application.createModule(propName, quarkPar); - } -} - -/******************************************************************************* - * Name: makeLoop - * Purpose: Use noise source and inversion result to make loop propagator, then - * add to application module. - * Parameters: application - main application that stores modules. - * propName - name of propagator module to create. - * srcName - name of noise source module to use. - * resName - name of inversion result on given noise source. - * Returns: None. 
- ******************************************************************************/ -inline void makeLoop(Application &application, std::string &propName, - std::string &srcName, std::string &resName) -{ - // If the loop propagator already exists, don't make the module again. - if (!(Environment::getInstance().hasModule(propName))) - { - MLoop::NoiseLoop::Par loopPar; - loopPar.q = resName; - loopPar.eta = srcName; - application.createModule(propName, loopPar); - } -} - -/******************************************************************************* - * Contraction module creation. - ******************************************************************************/ - -/******************************************************************************* - * Name: mesonContraction - * Purpose: Create meson contraction module and add to application module. - * Parameters: application - main application that stores modules. - * npt - specify n-point correlator (for labelling). - * q1 - quark propagator 1. - * q2 - quark propagator 2. - * label - unique label to construct module name. - * mom - momentum to project (default is zero) - * gammas - gamma insertions at source and sink. - * Returns: None. - ******************************************************************************/ -inline void mesonContraction(Application &application, unsigned int npt, - std::string &q1, std::string &q2, - std::string &label, - std::string mom = ZERO_MOM, - std::string gammas = "") -{ - std::string modName = std::to_string(npt) + "pt_" + label; - if (!(Environment::getInstance().hasModule(modName))) - { - MContraction::Meson::Par mesPar; - mesPar.output = std::to_string(npt) + "pt/" + label; - mesPar.q1 = q1; - mesPar.q2 = q2; - mesPar.mom = mom; - mesPar.gammas = gammas; - application.createModule(modName, mesPar); - } - } - -/******************************************************************************* - * Name: gamma3ptContraction - * Purpose: Create gamma3pt contraction module and add to application module. - * Parameters: application - main application that stores modules. - * npt - specify n-point correlator (for labelling). - * q1 - quark propagator 1. - * q2 - quark propagator 2. - * q3 - quark propagator 3. - * label - unique label to construct module name. - * gamma - gamma insertions between q2 and q3. - * Returns: None. - ******************************************************************************/ -inline void gamma3ptContraction(Application &application, unsigned int npt, - std::string &q1, std::string &q2, - std::string &q3, std::string &label, - Gamma::Algebra gamma = Gamma::Algebra::Identity) -{ - std::string modName = std::to_string(npt) + "pt_" + label; - if (!(Environment::getInstance().hasModule(modName))) - { - MContraction::Gamma3pt::Par gamma3ptPar; - gamma3ptPar.output = std::to_string(npt) + "pt/" + label; - gamma3ptPar.q1 = q1; - gamma3ptPar.q2 = q2; - gamma3ptPar.q3 = q3; - gamma3ptPar.gamma = gamma; - application.createModule(modName, gamma3ptPar); - } - } - -/******************************************************************************* - * Name: weakContraction[Eye,NonEye] - * Purpose: Create Weak Hamiltonian contraction module for Eye/NonEye topology - * and add to application module. - * Parameters: application - main application that stores modules. - * npt - specify n-point correlator (for labelling). - * q1 - quark propagator 1. - * q2 - quark propagator 2. - * q3 - quark propagator 3. - * q4 - quark propagator 4. - * label - unique label to construct module name. 
- * Returns: None. - ******************************************************************************/ -#define HW_CONTRACTION(top) \ -inline void weakContraction##top(Application &application, unsigned int npt,\ - std::string &q1, std::string &q2, \ - std::string &q3, std::string &q4, \ - std::string &label)\ -{\ - std::string modName = std::to_string(npt) + "pt_" + label;\ - if (!(Environment::getInstance().hasModule(modName)))\ - {\ - MContraction::WeakHamiltonian##top::Par weakPar;\ - weakPar.output = std::to_string(npt) + "pt/" + label;\ - weakPar.q1 = q1;\ - weakPar.q2 = q2;\ - weakPar.q3 = q3;\ - weakPar.q4 = q4;\ - application.createModule(modName, weakPar);\ - }\ -} -HW_CONTRACTION(Eye) // weakContractionEye -HW_CONTRACTION(NonEye) // weakContractionNonEye - -/******************************************************************************* - * Name: disc0Contraction - * Purpose: Create contraction module for 4pt Weak Hamiltonian + current - * disconnected topology for neutral mesons and add to application - * module. - * Parameters: application - main application that stores modules. - * q1 - quark propagator 1. - * q2 - quark propagator 2. - * q3 - quark propagator 3. - * q4 - quark propagator 4. - * label - unique label to construct module name. - * Returns: None. - ******************************************************************************/ -inline void disc0Contraction(Application &application, - std::string &q1, std::string &q2, - std::string &q3, std::string &q4, - std::string &label) -{ - std::string modName = "4pt_" + label; - if (!(Environment::getInstance().hasModule(modName))) - { - MContraction::WeakNeutral4ptDisc::Par disc0Par; - disc0Par.output = "4pt/" + label; - disc0Par.q1 = q1; - disc0Par.q2 = q2; - disc0Par.q3 = q3; - disc0Par.q4 = q4; - application.createModule(modName, disc0Par); - } - } - -/******************************************************************************* - * Name: discLoopContraction - * Purpose: Create contraction module for disconnected loop and add to - * application module. - * Parameters: application - main application that stores modules. - * q_loop - loop quark propagator. - * modName - unique module name. - * gamma - gamma matrix to use in contraction. - * Returns: None. - ******************************************************************************/ -inline void discLoopContraction(Application &application, - std::string &q_loop, std::string &modName, - Gamma::Algebra gamma = Gamma::Algebra::Identity) -{ - if (!(Environment::getInstance().hasModule(modName))) - { - MContraction::DiscLoop::Par discPar; - discPar.output = "disc/" + modName; - discPar.q_loop = q_loop; - discPar.gamma = gamma; - application.createModule(modName, discPar); - } - } diff --git a/tests/hadrons/Test_hadrons_rarekaon.cc b/tests/hadrons/Test_hadrons_rarekaon.cc deleted file mode 100644 index ab4d3ef1..00000000 --- a/tests/hadrons/Test_hadrons_rarekaon.cc +++ /dev/null @@ -1,342 +0,0 @@ -/******************************************************************************* - Grid physics library, www.github.com/paboyle/Grid - - Source file: tests/hadrons/Test_hadrons_rarekaon.cc - - Copyright (C) 2017 - - Author: Andrew Lawson - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution - directory. - *******************************************************************************/ - -#include "Test_hadrons.hpp" - -using namespace Grid; -using namespace Hadrons; - -enum quarks -{ - light = 0, - strange = 1, - charm = 2 -}; - -int main(int argc, char *argv[]) -{ - // parse command line ////////////////////////////////////////////////////// - std::string configStem; - - if (argc < 2) - { - std::cerr << "usage: " << argv[0] << " [Grid options]"; - std::cerr << std::endl; - std::exit(EXIT_FAILURE); - } - configStem = argv[1]; - - // initialization ////////////////////////////////////////////////////////// - Grid_init(&argc, &argv); - HadronsLogError.Active(GridLogError.isActive()); - HadronsLogWarning.Active(GridLogWarning.isActive()); - HadronsLogMessage.Active(GridLogMessage.isActive()); - HadronsLogIterative.Active(GridLogIterative.isActive()); - HadronsLogDebug.Active(GridLogDebug.isActive()); - LOG(Message) << "Grid initialized" << std::endl; - - // run setup /////////////////////////////////////////////////////////////// - Application application; - std::vector mass = {.01, .04, .2}; - std::vector flavour = {"l", "s", "c"}; - std::vector solvers = {"CG_l", "CG_s", "CG_c"}; - std::string kmom = "0. 0. 0. 0."; - std::string pmom = "1. 0. 0. 0."; - std::string qmom = "-1. 0. 0. 0."; - std::string mqmom = "1. 0. 0. 0."; - std::vector tKs = {0}; - unsigned int dt_pi = 16; - std::vector tJs = {8}; - unsigned int n_noise = 1; - unsigned int nt = 32; - bool do_disconnected(false); - - // Global parameters. - Application::GlobalPar globalPar; - globalPar.trajCounter.start = 1500; - globalPar.trajCounter.end = 1520; - globalPar.trajCounter.step = 20; - globalPar.seed = "1 2 3 4"; - globalPar.genetic.maxGen = 1000; - globalPar.genetic.maxCstGen = 200; - globalPar.genetic.popSize = 20; - globalPar.genetic.mutationRate = .1; - application.setPar(globalPar); - - // gauge field - if (configStem == "None") - { - application.createModule("gauge"); - } - else - { - MGauge::Load::Par gaugePar; - gaugePar.file = configStem; - application.createModule("gauge", gaugePar); - } - - // set fermion boundary conditions to be periodic space, antiperiodic time. - std::string boundary = "1 1 1 -1"; - - for (unsigned int i = 0; i < flavour.size(); ++i) - { - // actions - MAction::DWF::Par actionPar; - actionPar.gauge = "gauge"; - actionPar.Ls = 16; - actionPar.M5 = 1.8; - actionPar.mass = mass[i]; - actionPar.boundary = boundary; - application.createModule("DWF_" + flavour[i], actionPar); - - // solvers - // RBPrecCG -> CG - MSolver::RBPrecCG::Par solverPar; - solverPar.action = "DWF_" + flavour[i]; - solverPar.residual = 1.0e-8; - application.createModule(solvers[i], - solverPar); - } - - // Create noise propagators for loops. 
-    std::vector<std::string>              noiseSrcs;
-    std::vector<std::vector<std::string>> noiseRes;
-    std::vector<std::vector<std::string>> noiseProps;
-    if (n_noise > 0)
-    {
-        MSource::Z2::Par noisePar;
-        noisePar.tA = 0;
-        noisePar.tB = nt - 1;
-        std::string loop_stem = "loop_";
-
-        noiseRes.resize(flavour.size());
-        noiseProps.resize(flavour.size());
-        for (unsigned int nn = 0; nn < n_noise; ++nn)
-        {
-            std::string eta = INIT_INDEX("noise", nn);
-            application.createModule<MSource::Z2>(eta, noisePar);
-            noiseSrcs.push_back(eta);
-
-            for (unsigned int f = 0; f < flavour.size(); ++f)
-            {
-                std::string loop_prop = INIT_INDEX(loop_stem + flavour[f], nn);
-                std::string loop_res  = loop_prop + "_res";
-                makePropagator(application, loop_res, eta, solvers[f]);
-                makeLoop(application, loop_prop, eta, loop_res);
-                noiseRes[f].push_back(loop_res);
-                noiseProps[f].push_back(loop_prop);
-            }
-        }
-    }
-
-    // Translate rare kaon decay across specified timeslices.
-    for (unsigned int i = 0; i < tKs.size(); ++i)
-    {
-        // Zero-momentum wall source propagators for kaon and pion.
-        unsigned int tK     = tKs[i];
-        unsigned int tpi    = (tK + dt_pi) % nt;
-        std::string q_Kl_0  = INIT_INDEX("Q_l_0", tK);
-        std::string q_pil_0 = INIT_INDEX("Q_l_0", tpi);
-        MAKE_WALL_PROP(tK, q_Kl_0, solvers[light]);
-        MAKE_WALL_PROP(tpi, q_pil_0, solvers[light]);
-
-        // Wall sources for kaon and pion with momentum insertion. If either
-        // p or k are zero, or p = k, re-use the existing name to avoid
-        // duplicating a propagator.
-        std::string q_Ks_k  = INIT_INDEX("Q_Ks_k", tK);
-        std::string q_Ks_p  = INIT_INDEX((kmom == pmom) ? "Q_Ks_k" : "Q_Ks_p", tK);
-        std::string q_pil_k = INIT_INDEX((kmom == ZERO_MOM) ? "Q_l_0" : "Q_l_k", tpi);
-        std::string q_pil_p = INIT_INDEX((pmom == kmom) ? q_pil_k : ((pmom == ZERO_MOM) ? "Q_l_0" : "Q_l_p"), tpi);
-        MAKE_3MOM_WALL_PROP(tK, kmom, q_Ks_k, solvers[strange]);
-        MAKE_3MOM_WALL_PROP(tK, pmom, q_Ks_p, solvers[strange]);
-        MAKE_3MOM_WALL_PROP(tpi, kmom, q_pil_k, solvers[light]);
-        MAKE_3MOM_WALL_PROP(tpi, pmom, q_pil_p, solvers[light]);
-
-        /***********************************************************************
-         * CONTRACTIONS: pi and K 2pt contractions with mom = p, k.
-         **********************************************************************/
-        // Wall-Point
-        std::string PW_K_k  = INIT_INDEX("PW_K_k", tK);
-        std::string PW_K_p  = INIT_INDEX("PW_K_p", tK);
-        std::string PW_pi_k = INIT_INDEX("PW_pi_k", tpi);
-        std::string PW_pi_p = INIT_INDEX("PW_pi_p", tpi);
-        mesonContraction(application, 2, q_Kl_0, q_Ks_k, PW_K_k, kmom);
-        mesonContraction(application, 2, q_Kl_0, q_Ks_p, PW_K_p, pmom);
-        mesonContraction(application, 2, q_pil_k, q_pil_0, PW_pi_k, kmom);
-        mesonContraction(application, 2, q_pil_p, q_pil_0, PW_pi_p, pmom);
-        // Wall-Wall, to be done - requires modification of meson module.
-
-        /***********************************************************************
-         * CONTRACTIONS: 3pt Weak Hamiltonian, C & W (non-Eye type) classes.
-         **********************************************************************/
-        std::string HW_CW_k = LABEL_3PT("HW_CW_k", tK, tpi);
-        std::string HW_CW_p = LABEL_3PT("HW_CW_p", tK, tpi);
-        weakContractionNonEye(application, 3, q_Kl_0, q_Ks_k, q_pil_k, q_pil_0, HW_CW_k);
-        weakContractionNonEye(application, 3, q_Kl_0, q_Ks_p, q_pil_p, q_pil_0, HW_CW_p);
-
-        /***********************************************************************
-         * CONTRACTIONS: 3pt sd insertion.
-         **********************************************************************/
-        // Note: eventually will use wall sink smeared q_Kl_0 instead.
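The MAKE_WALL_PROP and MAKE_3MOM_WALL_PROP macros used above come from Test_hadrons.hpp, which is not part of this patch; a plausible expansion of the zero-momentum variant, assuming Hadrons' wall source module (the source-name convention here is illustrative, not quoted from the header):

    // sketch: create a wall source on timeslice tW, then solve through it
    std::string srcName = propName + "_src";   // hypothetical naming
    MSource::Wall::Par wallPar;
    wallPar.tW  = tW;        // source timeslice
    wallPar.mom = ZERO_MOM;  // the 3-momentum variant sets this instead
    application.createModule<MSource::Wall>(srcName, wallPar);
    makePropagator(application, propName, srcName, solver);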
- std::string sd_k = LABEL_3PT("sd_k", tK, tpi); - std::string sd_p = LABEL_3PT("sd_p", tK, tpi); - gamma3ptContraction(application, 3, q_Kl_0, q_Ks_k, q_pil_k, sd_k); - gamma3ptContraction(application, 3, q_Kl_0, q_Ks_p, q_pil_p, sd_p); - - for (unsigned int nn = 0; nn < n_noise; ++nn) - { - /******************************************************************* - * CONTRACTIONS: 3pt Weak Hamiltonian, S and E (Eye type) classes. - ******************************************************************/ - // Note: eventually will use wall sink smeared q_Kl_0 instead. - for (unsigned int f = 0; f < flavour.size(); ++f) - { - if ((f != strange) || do_disconnected) - { - std::string HW_SE_k = LABEL_3PT("HW_SE_k_" + flavour[f], tK, tpi); - std::string HW_SE_p = LABEL_3PT("HW_SE_p_" + flavour[f], tK, tpi); - std::string loop_q = noiseProps[f][nn]; - weakContractionEye(application, 3, q_Kl_0, q_Ks_k, q_pil_k, loop_q, HW_CW_k); - weakContractionEye(application, 3, q_Kl_0, q_Ks_p, q_pil_p, loop_q, HW_CW_p); - } - } - } - - // Perform separate contractions for each t_J position. - for (unsigned int j = 0; j < tJs.size(); ++j) - { - // Sequential sources for current insertions. Local for now, - // gamma_0 only. - unsigned int tJ = (tJs[j] + tK) % nt; - MSource::SeqGamma::Par seqPar; - std::string q_KlCl_q = LABEL_3PT("Q_KlCl_q", tK, tJ); - std::string q_KsCs_mq = LABEL_3PT("Q_KsCs_mq", tK, tJ); - std::string q_pilCl_q = LABEL_3PT("Q_pilCl_q", tpi, tJ); - std::string q_pilCl_mq = LABEL_3PT("Q_pilCl_mq", tpi, tJ); - MAKE_SEQUENTIAL_PROP(tJ, q_Kl_0, qmom, q_KlCl_q, solvers[light]); - MAKE_SEQUENTIAL_PROP(tJ, q_Ks_k, mqmom, q_KsCs_mq, solvers[strange]); - MAKE_SEQUENTIAL_PROP(tJ, q_pil_p, qmom, q_pilCl_q, solvers[light]); - MAKE_SEQUENTIAL_PROP(tJ, q_pil_0, mqmom, q_pilCl_mq, solvers[light]); - - /******************************************************************* - * CONTRACTIONS: pi and K 3pt contractions with current insertion. - ******************************************************************/ - // Wall-Point - std::string C_PW_Kl = LABEL_3PT("C_PW_Kl", tK, tJ); - std::string C_PW_Ksb = LABEL_3PT("C_PW_Ksb", tK, tJ); - std::string C_PW_pilb = LABEL_3PT("C_PW_pilb", tK, tJ); - std::string C_PW_pil = LABEL_3PT("C_PW_pil", tK, tJ); - mesonContraction(application, 3, q_KlCl_q, q_Ks_k, C_PW_Kl, pmom); - mesonContraction(application, 3, q_Kl_0, q_KsCs_mq, C_PW_Ksb, pmom); - mesonContraction(application, 3, q_pil_0, q_pilCl_q, C_PW_pilb, kmom); - mesonContraction(application, 3, q_pilCl_mq, q_pil_p, C_PW_pil, kmom); - // Wall-Wall, to be done. - - /******************************************************************* - * CONTRACTIONS: 4pt contractions, C & W classes. - ******************************************************************/ - std::string CW_Kl = LABEL_4PT("CW_Kl", tK, tJ, tpi); - std::string CW_Ksb = LABEL_4PT("CW_Ksb", tK, tJ, tpi); - std::string CW_pilb = LABEL_4PT("CW_pilb", tK, tJ, tpi); - std::string CW_pil = LABEL_4PT("CW_pil", tK, tJ, tpi); - weakContractionNonEye(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, q_pil_0, CW_Kl); - weakContractionNonEye(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, q_pil_0, CW_Ksb); - weakContractionNonEye(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, q_pil_0, CW_pilb); - weakContractionNonEye(application, 4, q_Kl_0, q_Ks_k, q_pil_p, q_pilCl_mq, CW_pil); - - /******************************************************************* - * CONTRACTIONS: 4pt contractions, sd insertions. 
- ******************************************************************/ - // Note: eventually will use wall sink smeared q_Kl_0/q_KlCl_q instead. - std::string sd_Kl = LABEL_4PT("sd_Kl", tK, tJ, tpi); - std::string sd_Ksb = LABEL_4PT("sd_Ksb", tK, tJ, tpi); - std::string sd_pilb = LABEL_4PT("sd_pilb", tK, tJ, tpi); - gamma3ptContraction(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, sd_Kl); - gamma3ptContraction(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, sd_Ksb); - gamma3ptContraction(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, sd_pilb); - - // Sequential sources for each noise propagator. - for (unsigned int nn = 0; nn < n_noise; ++nn) - { - std::string loop_stem = "loop_"; - - // Contraction required for each quark flavour - alternatively - // drop the strange loop if not performing disconnected - // contractions or neglecting H_W operators Q_3 -> Q_10. - for (unsigned int f = 0; f < flavour.size(); ++f) - { - if ((f != strange) || do_disconnected) - { - std::string eta = noiseSrcs[nn]; - std::string loop_q = noiseProps[f][nn]; - std::string loop_qCq = LABEL_3PT(loop_stem + flavour[f], tJ, nn); - std::string loop_qCq_res = loop_qCq + "_res"; - MAKE_SEQUENTIAL_PROP(tJ, noiseRes[f][nn], qmom, - loop_qCq_res, solvers[f]); - makeLoop(application, loop_qCq, eta, loop_qCq_res); - - /******************************************************* - * CONTRACTIONS: 4pt contractions, S & E classes. - ******************************************************/ - // Note: eventually will use wall sink smeared q_Kl_0/q_KlCl_q instead. - std::string SE_Kl = LABEL_4PT_NOISE("SE_Kl", tK, tJ, tpi, nn); - std::string SE_Ksb = LABEL_4PT_NOISE("SE_Ksb", tK, tJ, tpi, nn); - std::string SE_pilb = LABEL_4PT_NOISE("SE_pilb", tK, tJ, tpi, nn); - std::string SE_loop = LABEL_4PT_NOISE("SE_loop", tK, tJ, tpi, nn); - weakContractionEye(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, loop_q, SE_Kl); - weakContractionEye(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, loop_q, SE_Ksb); - weakContractionEye(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, loop_q, SE_pilb); - weakContractionEye(application, 4, q_Kl_0, q_Ks_k, q_pil_p, loop_qCq, SE_loop); - - /******************************************************* - * CONTRACTIONS: 4pt contractions, pi0 disconnected - * loop. - ******************************************************/ - std::string disc0 = LABEL_4PT_NOISE("disc0", tK, tJ, tpi, nn); - disc0Contraction(application, q_Kl_0, q_Ks_k, q_pilCl_q, loop_q, disc0); - - /******************************************************* - * CONTRACTIONS: Disconnected loop. - ******************************************************/ - std::string discLoop = "disc_" + loop_qCq; - discLoopContraction(application, loop_qCq, discLoop); - } - } - } - } - } - // execution - std::string par_file_name = "rarekaon_000_100_tK0_tpi16_tJ8_noloop_mc0.2.xml"; - application.saveParameterFile(par_file_name); - application.run(); - - // epilogue - LOG(Message) << "Grid is finalizing now" << std::endl; - Grid_finalize(); - - return EXIT_SUCCESS; -} From 20ac13fdf36d2d82fd0403b1d92d9771849889d6 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Thu, 8 Jun 2017 17:43:39 +0100 Subject: [PATCH 047/170] QedFVol: add ChargedProp as an input to ScalarVP module, instead of calculating scalar propagator within ScalarVP. 
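The point of the change: ScalarVP previously recomputed the O(alpha) charged scalar propagators itself; now it names the objects that ChargedProp registers (the _Q, _Sun and _Tad terms) among its inputs, so the scheduler runs ChargedProp first and ScalarVP fetches the cached lattices from the environment. A minimal sketch of the consumer side, assuming the usual Hadrons object-store API:

    // par().scalarProp holds the name of the upstream ChargedProp module
    ScalarField &propQ   = *env().getObject<ScalarField>(par().scalarProp + "_Q");
    ScalarField &propSun = *env().getObject<ScalarField>(par().scalarProp + "_Sun");
    ScalarField &propTad = *env().getObject<ScalarField>(par().scalarProp + "_Tad");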
--- extras/Hadrons/Modules/MScalar/ChargedProp.cc | 50 ++-- .../Hadrons/Modules/MScalar/ChargedProp.hpp | 2 +- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 240 ++---------------- extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 8 +- 4 files changed, 51 insertions(+), 249 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc index be64b5ec..a9089056 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc @@ -23,8 +23,8 @@ std::vector TChargedProp::getInput(void) std::vector TChargedProp::getOutput(void) { - std::vector out = {getName(), getName()+"_0", getName()+"_D1", - getName()+"_D1D1", getName()+"_D2"}; + std::vector out = {getName(), getName()+"_Q", + getName()+"_Sun", getName()+"_Tad"}; return out; } @@ -40,9 +40,9 @@ void TChargedProp::setup(void) } GFSrcName_ = "_" + getName() + "_DinvSrc"; prop0Name_ = getName() + "_0"; - propD1Name_ = getName() + "_D1"; - propD1D1Name_ = getName() + "_D1D1"; - propD2Name_ = getName() + "_D2"; + propQName_ = getName() + "_Q"; + propSunName_ = getName() + "_Sun"; + propTadName_ = getName() + "_Tad"; if (!env().hasRegisteredObject(freeMomPropName_)) { env().registerLattice(freeMomPropName_); @@ -63,9 +63,9 @@ void TChargedProp::setup(void) env().registerLattice(prop0Name_); } env().registerLattice(getName()); - env().registerLattice(propD1Name_); - env().registerLattice(propD1D1Name_); - env().registerLattice(propD2Name_); + env().registerLattice(propQName_); + env().registerLattice(propSunName_); + env().registerLattice(propTadName_); } // execution /////////////////////////////////////////////////////////////////// @@ -140,9 +140,9 @@ void TChargedProp::execute(void) << ", charge= " << par().charge << ")..." 
<< std::endl; ScalarField &prop = *env().createLattice(getName()); - ScalarField &propD1 = *env().createLattice(propD1Name_); - ScalarField &propD1D1 = *env().createLattice(propD1D1Name_); - ScalarField &propD2 = *env().createLattice(propD2Name_); + ScalarField &propQ = *env().createLattice(propQName_); + ScalarField &propSun = *env().createLattice(propSunName_); + ScalarField &propTad = *env().createLattice(propTadName_); ScalarField buf(env().getGrid()); ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_; double q = par().charge; @@ -151,22 +151,22 @@ void TChargedProp::execute(void) buf = GFSrc; momD1(buf, fft); buf = -G*buf; - fft.FFT_all_dim(propD1, buf, FFT::backward); + fft.FFT_all_dim(propQ, buf, FFT::backward); // G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src) buf = -buf; momD1(buf, fft); - propD1D1 = G*buf; - fft.FFT_all_dim(propD1D1, propD1D1, FFT::backward); + propSun = G*buf; + fft.FFT_all_dim(propSun, propSun, FFT::backward); // -G*momD2*G*F*Src (momD2 = F*D2*Finv) buf = GFSrc; momD2(buf, fft); buf = -G*buf; - fft.FFT_all_dim(propD2, buf, FFT::backward); + fft.FFT_all_dim(propTad, buf, FFT::backward); // full charged scalar propagator - prop = (*prop0_) + q*propD1 + q*q*propD1D1 + q*q*propD2; + prop = (*prop0_) + q*propQ + q*q*propSun + q*q*propTad; // OUTPUT IF NECESSARY if (!par().output.empty()) @@ -200,29 +200,29 @@ void TChargedProp::execute(void) } write(writer, "prop_0", result); - // Write propagator D1 term - sliceSum(propD1, vecBuf, Tp); + // Write propagator O(q) term + sliceSum(propQ, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) { result[t] = TensorRemove(vecBuf[t]); } - write(writer, "prop_D1", result); + write(writer, "prop_Q", result); - // Write propagator D1D1 term - sliceSum(propD1D1, vecBuf, Tp); + // Write propagator sunset term + sliceSum(propSun, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) { result[t] = TensorRemove(vecBuf[t]); } - write(writer, "prop_D1D1", result); + write(writer, "prop_Sun", result); - // Write propagator D2 term - sliceSum(propD2, vecBuf, Tp); + // Write propagator tadpole term + sliceSum(propTad, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) { result[t] = TensorRemove(vecBuf[t]); } - write(writer, "prop_D2", result); + write(writer, "prop_Tad", result); } } diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp index aeb92179..369fff30 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp @@ -46,7 +46,7 @@ private: void momD2(ScalarField &s, FFT &fft); private: std::string freeMomPropName_, GFSrcName_, prop0Name_, - propD1Name_, propD1D1Name_, propD2Name_; + propQName_, propSunName_, propTadName_; std::vector phaseName_; ScalarField *freeMomProp_, *GFSrc_, *prop0_; std::vector phase_; diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index c91b98ae..7a3b4f9e 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -1,3 +1,4 @@ +#include #include #include @@ -16,16 +17,19 @@ TScalarVP::TScalarVP(const std::string name) // dependencies/products /////////////////////////////////////////////////////// std::vector TScalarVP::getInput(void) { - std::vector in = {par().source, par().emField}; + propQName_ = par().scalarProp + "_Q"; + propSunName_ = par().scalarProp + "_Sun"; + propTadName_ = par().scalarProp + "_Tad"; + + std::vector in = {par().emField, propQName_, 
propSunName_, + propTadName_}; return in; } std::vector TScalarVP::getOutput(void) { - std::vector out = {getName()+"_propQ", - getName()+"_propSun", - getName()+"_propTad"}; + std::vector out; for (unsigned int mu = 0; mu < env().getNd(); ++mu) { @@ -44,12 +48,9 @@ std::vector TScalarVP::getOutput(void) // setup /////////////////////////////////////////////////////////////////////// void TScalarVP::setup(void) { - freeMomPropName_ = FREEMOMPROP(par().mass); - GFSrcName_ = "_" + getName() + "_DinvSrc"; - prop0Name_ = getName() + "_prop0"; - propQName_ = getName() + "_propQ"; - propSunName_ = getName() + "_propSun"; - propTadName_ = getName() + "_propTad"; + freeMomPropName_ = FREEMOMPROP(static_cast(env().getModule(par().scalarProp))->par().mass); + GFSrcName_ = "_" + par().scalarProp + "_DinvSrc"; + prop0Name_ = par().scalarProp + "_0"; phaseName_.clear(); muPropQName_.clear(); @@ -74,174 +75,38 @@ void TScalarVP::setup(void) freeVpTensorName_.push_back(freeVpTensorName_mu); } - if (!env().hasRegisteredObject(freeMomPropName_)) - { - env().registerLattice(freeMomPropName_); - } - if (!env().hasRegisteredObject(phaseName_[0])) - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - env().registerLattice(phaseName_[mu]); - } - } - if (!env().hasRegisteredObject(GFSrcName_)) - { - env().registerLattice(GFSrcName_); - } - if (!env().hasRegisteredObject(prop0Name_)) - { - env().registerLattice(prop0Name_); - } - env().registerLattice(propQName_); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { env().registerLattice(muPropQName_[mu]); - } - env().registerLattice(propSunName_); - env().registerLattice(propTadName_); - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { + for (unsigned int nu = 0; nu < env().getNd(); ++nu) { env().registerLattice(vpTensorName_[mu][nu]); env().registerLattice(freeVpTensorName_[mu][nu]); } - } + } } // execution /////////////////////////////////////////////////////////////////// void TScalarVP::execute(void) { - // CACHING ANALYTIC EXPRESSIONS - ScalarField &source = *env().getObject(par().source); + // Get objects cached by ChargedProp module Complex ci(0.0,1.0); FFT fft(env().getGrid()); - Real q = par().charge; + Real q = static_cast(env().getModule(par().scalarProp))->par().charge; - // cache momentum-space free scalar propagator - if (!env().hasCreatedObject(freeMomPropName_)) + freeMomProp_ = env().getObject(freeMomPropName_); + for (unsigned int mu = 0; mu < env().getNd(); ++mu) { - LOG(Message) << "Caching momentum space free scalar propagator" - << " (mass= " << par().mass << ")..." << std::endl; - freeMomProp_ = env().createLattice(freeMomPropName_); - SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass); - } - else - { - freeMomProp_ = env().getObject(freeMomPropName_); - } - // cache phases - if (!env().hasCreatedObject(phaseName_[0])) - { - std::vector &l = env().getGrid()->_fdimensions; - - LOG(Message) << "Caching shift phases..." 
<< std::endl; - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - Real twoPiL = M_PI*2./l[mu]; - - phase_.push_back(env().createLattice(phaseName_[mu])); - LatticeCoordinate(*(phase_[mu]), mu); - *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu]))); - } - } - else - { - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - phase_.push_back(env().getObject(phaseName_[mu])); - } - } - // cache G*F*src - if (!env().hasCreatedObject(GFSrcName_)) - { - GFSrc_ = env().createLattice(GFSrcName_); - fft.FFT_all_dim(*GFSrc_, source, FFT::forward); - *GFSrc_ = (*freeMomProp_)*(*GFSrc_); - } - else - { - GFSrc_ = env().getObject(GFSrcName_); - } - // cache position-space free scalar propagators - if (!env().hasCreatedObject(prop0Name_)) - { - prop0_ = env().createLattice(prop0Name_); - fft.FFT_all_dim(*prop0_, *GFSrc_, FFT::backward); - } - else - { - prop0_ = env().getObject(prop0Name_); + phase_.push_back(env().getObject(phaseName_[mu])); } + GFSrc_ = env().getObject(GFSrcName_); + prop0_ = env().getObject(prop0Name_); - // PROPAGATOR CALCULATION // Propagator from unshifted source - LOG(Message) << "Computing O(alpha) charged scalar propagator" - << " (mass= " << par().mass - << ", charge= " << q << ")..." - << std::endl; - ScalarField &propQ = *env().createLattice(propQName_); - ScalarField &propSun = *env().createLattice(propSunName_); - ScalarField &propTad = *env().createLattice(propTadName_); - chargedProp(propQ, propSun, propTad, *GFSrc_, fft); - // // OUTPUT IF NECESSARY - // if (!par().output.empty()) - // { - // ScalarField fullProp = (*prop0_) + q*propQ + q*q*propSun + q*q*propTad; - // std::string filename = par().output + "_prop_000." + - // std::to_string(env().getTrajectory()); - - // LOG(Message) << "Saving zero-momentum projection to '" - // << filename << "'..." << std::endl; - - // CorrWriter writer(filename); - // std::vector vecBuf; - // std::vector result; - - // write(writer, "charge", q); - - // // Write full propagator - // sliceSum(fullProp, vecBuf, Tp); - // result.resize(vecBuf.size()); - // for (unsigned int t = 0; t < vecBuf.size(); ++t) - // { - // result[t] = TensorRemove(vecBuf[t]); - // } - // write(writer, "prop", result); - - // // Write free propagator - // sliceSum(*prop0_, vecBuf, Tp); - // for (unsigned int t = 0; t < vecBuf.size(); ++t) - // { - // result[t] = TensorRemove(vecBuf[t]); - // } - // write(writer, "prop_0", result); - - // // Write propagator D1 term - // sliceSum(propD1, vecBuf, Tp); - // for (unsigned int t = 0; t < vecBuf.size(); ++t) - // { - // result[t] = TensorRemove(vecBuf[t]); - // } - // write(writer, "prop_q", result); - - // // Write propagator D1D1 term - // sliceSum(propD1D1, vecBuf, Tp); - // for (unsigned int t = 0; t < vecBuf.size(); ++t) - // { - // result[t] = TensorRemove(vecBuf[t]); - // } - // write(writer, "prop_sun", result); - - // // Write propagator D2 term - // sliceSum(propD2, vecBuf, Tp); - // for (unsigned int t = 0; t < vecBuf.size(); ++t) - // { - // result[t] = TensorRemove(vecBuf[t]); - // } - // write(writer, "prop_tad", result); - // } + ScalarField &propQ = *env().getObject(propQName_); + ScalarField &propSun = *env().getObject(propSunName_); + ScalarField &propTad = *env().getObject(propTadName_); // Propagators from shifted sources LOG(Message) << "Computing O(q) charged scalar propagators..." 
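These per-direction propagators (the muPropQ objects, solved from sources shifted by one site in each direction nu) are needed because the vacuum polarisation is built from the point-split, conserved current: each vertex couples the field at x to the field at x + nu-hat, dressed by the photon field. Schematically, per site and direction pair,

    \Pi_{\mu\nu} \sim 2\,\mathrm{Re}\Big[\, S^\dagger_{-\hat\nu}\,(1+iqA_\mu)\,S_{+\hat\mu}\,(1+iqA_\nu(0)) \;-\; \dots \Big],

which is why the contraction code below combines Cshift-ed propagators with (1.0 + ci*q*Amu) factors rather than using a local current.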
@@ -357,7 +222,7 @@ void TScalarVP::execute(void) std::vector result; write(writer, "charge", q); - write(writer, "mass", par().mass); + write(writer, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); for (unsigned int mu = 0; mu < env().getNd(); ++mu) { @@ -386,34 +251,6 @@ void TScalarVP::execute(void) } } -// Calculate O(q) and O(q^2) terms of position-space charged propagator -void TScalarVP::chargedProp(ScalarField &prop_q, ScalarField &prop_sun, - ScalarField &prop_tad, ScalarField &GFSrc, - FFT &fft) -{ - Complex ci(0.0,1.0); - ScalarField &G = *freeMomProp_; - ScalarField buf(env().getGrid()); - - // -G*momD1*G*F*Src (momD1 = F*D1*Finv) - buf = GFSrc; - momD1(buf, fft); - buf = G*buf; - prop_q = -buf; - fft.FFT_all_dim(prop_q, prop_q, FFT::backward); - - // G*momD1*G*momD1*G*F*Src - momD1(buf, fft); - prop_sun = G*buf; - fft.FFT_all_dim(prop_sun, prop_sun, FFT::backward); - - // -G*momD2*G*F*Src (momD2 = F*D2*Finv) - buf = GFSrc; - momD2(buf, fft); - prop_tad = -G*buf; - fft.FFT_all_dim(prop_tad, prop_tad, FFT::backward); -} - void TScalarVP::momD1(ScalarField &s, FFT &fft) { EmField &A = *env().getObject(par().emField); @@ -443,32 +280,3 @@ void TScalarVP::momD1(ScalarField &s, FFT &fft) s = result; } - -void TScalarVP::momD2(ScalarField &s, FFT &fft) -{ - EmField &A = *env().getObject(par().emField); - ScalarField buf(env().getGrid()), result(env().getGrid()), - Amu(env().getGrid()); - - result = zero; - - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - Amu = peekLorentz(A, mu); - buf = (*phase_[mu])*s; - fft.FFT_all_dim(buf, buf, FFT::backward); - buf = Amu*Amu*buf; - fft.FFT_all_dim(buf, buf, FFT::forward); - result = result + .5*buf; - } - fft.FFT_all_dim(s, s, FFT::backward); - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - Amu = peekLorentz(A, mu); - buf = Amu*Amu*s; - fft.FFT_all_dim(buf, buf, FFT::forward); - result = result + .5*adj(*phase_[mu])*buf; - } - - s = result; -} diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp index fdbad6f6..81071ca0 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp @@ -17,9 +17,7 @@ class ScalarVPPar: Serializable public: GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar, std::string, emField, - std::string, source, - double, mass, - double, charge, + std::string, scalarProp, std::string, output); }; @@ -42,11 +40,7 @@ public: // execution virtual void execute(void); private: - void chargedProp(ScalarField &prop_q, ScalarField &prop_sun, - ScalarField &prop_tad, ScalarField &GFSrc, - FFT &fft); void momD1(ScalarField &s, FFT &fft); - void momD2(ScalarField &s, FFT &fft); private: std::string freeMomPropName_, GFSrcName_, prop0Name_, propQName_, From 2bc4d0a20ec038786f6544783b368fed3bbfb804 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 8 Jun 2017 22:21:25 +0100 Subject: [PATCH 048/170] Move code into utils --- tests/core/Test_fft_gfix.cc | 242 ++++-------------------------------- 1 file changed, 26 insertions(+), 216 deletions(-) diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 7938241e..9732eb85 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -28,212 +28,6 @@ Author: Peter Boyle /* END LEGAL */ #include -using namespace Grid; -using namespace Grid::QCD; - -template -class FourierAcceleratedGaugeFixer : public Gimpl { - public: - INHERIT_GIMPL_TYPES(Gimpl); - - typedef typename Gimpl::GaugeLinkField GaugeMat; - typedef typename 
Gimpl::GaugeField GaugeLorentz; - - static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { - for(int mu=0;mu &A,GaugeMat &dmuAmu) { - dmuAmu=zero; - for(int mu=0;mu::avgPlaquette(Umu); - Real org_link_trace=WilsonLoops::linkTrace(Umu); - Real old_trace = org_link_trace; - Real trG; - - std::vector U(Nd,grid); - GaugeMat dmuAmu(grid); - - for(int i=0;i(Umu,mu); - //trG = SteepestDescentStep(U,alpha,dmuAmu); - trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu); - for(int mu=0;mu(Umu,U[mu],mu); - // Monitor progress and convergence test - // infrequently to minimise cost overhead - if ( i %20 == 0 ) { - Real plaq =WilsonLoops::avgPlaquette(Umu); - Real link_trace=WilsonLoops::linkTrace(Umu); - - std::cout << GridLogMessage << " Iteration "< &U,Real & alpha, GaugeMat & dmuAmu) { - GridBase *grid = U[0]._grid; - - std::vector A(Nd,grid); - GaugeMat g(grid); - - GaugeLinkToLieAlgebraField(U,A); - ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); - - - Real vol = grid->gSites(); - Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; - - SU::GaugeTransform(U,g); - - return trG; - } - - static Real FourierAccelSteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { - - GridBase *grid = U[0]._grid; - - Real vol = grid->gSites(); - - FFT theFFT((GridCartesian *)grid); - - LatticeComplex Fp(grid); - LatticeComplex psq(grid); psq=zero; - LatticeComplex pmu(grid); - LatticeComplex one(grid); one = Complex(1.0,0.0); - - GaugeMat g(grid); - GaugeMat dmuAmu_p(grid); - std::vector A(Nd,grid); - - GaugeLinkToLieAlgebraField(U,A); - - DmuAmu(A,dmuAmu); - - theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward); - - ////////////////////////////////// - // Work out Fp = psq_max/ psq... - ////////////////////////////////// - std::vector latt_size = grid->GlobalDimensions(); - std::vector coor(grid->_ndimension,0); - for(int mu=0;mu::taExp(ciadmam,g); - - Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; - - SU::GaugeTransform(U,g); - - return trG; - } - - static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) { - GridBase *grid = g._grid; - Complex cialpha(0.0,-alpha); - GaugeMat ciadmam(grid); - DmuAmu(A,dmuAmu); - ciadmam = dmuAmu*cialpha; - SU::taExp(ciadmam,g); - } -/* - //////////////////////////////////////////////////////////////// - // NB The FT for fields living on links has an extra phase in it - // Could add these to the FFT class as a later task since this code - // might be reused elsewhere ???? 
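The extra phase alluded to in this comment is the usual half-link factor: A_mu lives on the link between x and x + mu-hat, so its lattice Fourier transform carries an additional factor e^{\pm i p_\mu/2} relative to a site field (sign depending on the FT convention),

    \tilde A_\mu(p) = e^{\pm i p_\mu/2} \sum_x e^{-i p\cdot x}\, A_\mu(x+\hat\mu/2),

which is what the pha phase fields in these commented-out transforms are for.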
- //////////////////////////////////////////////////////////////// - static void InverseFourierTransformAmu(FFT &theFFT,const std::vector &Ap,std::vector &Ax) { - GridBase * grid = theFFT.Grid(); - std::vector latt_size = grid->GlobalDimensions(); - - ComplexField pmu(grid); - ComplexField pha(grid); - GaugeMat Apha(grid); - - Complex ci(0.0,1.0); - - for(int mu=0;mu &Ax,std::vector &Ap) { - GridBase * grid = theFFT.Grid(); - std::vector latt_size = grid->GlobalDimensions(); - - ComplexField pmu(grid); - ComplexField pha(grid); - Complex ci(0.0,1.0); - - // Sign convention for FFTW calls: - // A(x)= Sum_p e^ipx A(p) / V - // A(p)= Sum_p e^-ipx A(x) - - for(int mu=0;mu seeds({1,2,3,4}); @@ -264,22 +58,24 @@ int main (int argc, char ** argv) std::cout<< "*****************************************************************" <::avgPlaquette(Umu); std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + Umu = Urnd; + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,false); plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Final plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true); - // std::cout<< "*****************************************************************" <::avgPlaquette(Umu); + std::cout << " Final plaquette "<::avgPlaquette(Umu); + std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true); + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "< Date: Thu, 8 Jun 2017 22:21:50 +0100 Subject: [PATCH 049/170] Move Gfix into utils --- lib/Grid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Grid.h b/lib/Grid.h index 543b0330..bf548211 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -41,6 +41,7 @@ Author: paboyle #include #include #include +#include #include #include From 70ab598c96401761996b78f6d0343f16267c6e73 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 8 Jun 2017 22:22:23 +0100 Subject: [PATCH 050/170] Move gfix into utils --- lib/qcd/utils/GaugeFix.h | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 lib/qcd/utils/GaugeFix.h diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h new file mode 100644 index 00000000..4ff216e4 --- /dev/null +++ b/lib/qcd/utils/GaugeFix.h @@ -0,0 +1,188 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +//#include + +using namespace Grid; +using namespace Grid::QCD; + +template +class FourierAcceleratedGaugeFixer : public Gimpl { + public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { + for(int mu=0;mu &A,GaugeMat &dmuAmu) { + dmuAmu=zero; + for(int mu=0;mu::avgPlaquette(Umu); + Real org_link_trace=WilsonLoops::linkTrace(Umu); + Real old_trace = org_link_trace; + Real trG; + + std::vector U(Nd,grid); + GaugeMat dmuAmu(grid); + + for(int i=0;i(Umu,mu); + if ( Fourier==false ) { + trG = SteepestDescentStep(U,alpha,dmuAmu); + } else { + trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu); + } + for(int mu=0;mu(Umu,U[mu],mu); + // Monitor progress and convergence test + // infrequently to minimise cost overhead + if ( i %20 == 0 ) { + Real plaq =WilsonLoops::avgPlaquette(Umu); + Real link_trace=WilsonLoops::linkTrace(Umu); + + if (Fourier) + std::cout << GridLogMessage << "Fourier Iteration "< &U,Real & alpha, GaugeMat & dmuAmu) { + GridBase *grid = U[0]._grid; + + std::vector A(Nd,grid); + GaugeMat g(grid); + + GaugeLinkToLieAlgebraField(U,A); + ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); + + + Real vol = grid->gSites(); + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static Real FourierAccelSteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { + + GridBase *grid = U[0]._grid; + + Real vol = grid->gSites(); + + FFT theFFT((GridCartesian *)grid); + + LatticeComplex Fp(grid); + LatticeComplex psq(grid); psq=zero; + LatticeComplex pmu(grid); + LatticeComplex one(grid); one = Complex(1.0,0.0); + + GaugeMat g(grid); + GaugeMat dmuAmu_p(grid); + std::vector A(Nd,grid); + + GaugeLinkToLieAlgebraField(U,A); + + DmuAmu(A,dmuAmu); + + theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward); + + ////////////////////////////////// + // Work out Fp = psq_max/ psq... 
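+    // (Fourier acceleration: the steepest-descent gradient is rescaled in
+    //  momentum space by psq_max/psq, so slow long-wavelength modes are
+    //  boosted to relax at the same rate as the fastest short-distance
+    //  ones; this is the standard trick that gives the class its name.)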
+ ////////////////////////////////// + std::vector latt_size = grid->GlobalDimensions(); + std::vector coor(grid->_ndimension,0); + for(int mu=0;mu::taExp(ciadmam,g); + + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) { + GridBase *grid = g._grid; + Complex cialpha(0.0,-alpha); + GaugeMat ciadmam(grid); + DmuAmu(A,dmuAmu); + ciadmam = dmuAmu*cialpha; + SU::taExp(ciadmam,g); + } +}; + From 42f0afcbfa7c1ecbaf57380a8722f46d00d892d7 Mon Sep 17 00:00:00 2001 From: James Harrison Date: Fri, 9 Jun 2017 18:08:40 +0100 Subject: [PATCH 051/170] QedFVol: Output all scalar VP diagrams separately --- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 276 ++++++++++++++++----- 1 file changed, 214 insertions(+), 62 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 7a3b4f9e..19cdbb9a 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -126,7 +126,7 @@ void TScalarVP::execute(void) // CONTRACTIONS ScalarField prop1(env().getGrid()), prop2(env().getGrid()); EmField &A = *env().getObject(par().emField); - ScalarField Amu(env().getGrid()); + ScalarField Amu(env().getGrid()), tmp_vp(env().getGrid()); TComplex Anu0; std::vector coor0 = {0, 0, 0, 0}; std::vector > vpTensor, freeVpTensor; @@ -143,6 +143,33 @@ void TScalarVP::execute(void) freeVpTensor.push_back(freeVpTensor_mu); } + // Open output files if necessary + CorrWriter *writer, *writer0, *writerD; + std::vector vecBuf; + std::vector result; + if (!par().output.empty()) + { + std::string filename = par().output + "." + + std::to_string(env().getTrajectory()); + std::string filename0 = par().output + "_free." + + std::to_string(env().getTrajectory()); + std::string filenameD = par().output + "_diagrams." + + std::to_string(env().getTrajectory()); + + // LOG(Message) << "Saving zero-momentum projection to '" + // << filename << "'..." 
<< std::endl; + writer = new CorrWriter(filename); + writer0 = new CorrWriter(filename0); + writerD = new CorrWriter(filenameD); + + write(*writer, "charge", q); + write(*writer, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + write(*writer0, "charge", 0.0); + write(*writer0, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + write(*writerD, "charge", q); + write(*writerD, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + } + for (unsigned int nu = 0; nu < env().getNd(); ++nu) { peekSite(Anu0, peekLorentz(A, nu), coor0); @@ -158,75 +185,207 @@ void TScalarVP::execute(void) prop2 = Cshift(*prop0_, nu, -1); freeVpTensor[mu][nu] = adj(prop2) * Cshift(prop1, mu, 1); freeVpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * prop1; + freeVpTensor[mu][nu] = 2.0*real(freeVpTensor[mu][nu]); + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(freeVpTensor[mu][nu], vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writer0, + "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } // "Exchange" terms prop1 += q*propQ; prop2 += q*muPropQ[nu]; - vpTensor[mu][nu] = adj(prop2) * (1.0 + ci*q*Amu) - * Cshift(prop1, mu, 1) * (1.0 + ci*q*Anu0); - vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu) - * prop1 * (1.0 + ci*q*Anu0); + tmp_vp = adj(prop2) * (1.0 + ci*q*Amu) + * Cshift(prop1, mu, 1) * (1.0 + ci*q*Anu0); + tmp_vp -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu) + * prop1 * (1.0 + ci*q*Anu0); + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] = tmp_vp*1.0; + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_exchange_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } // Subtract O(alpha^2) term prop1 = q*propQ; prop2 = q*muPropQ[nu]; - vpTensor[mu][nu] -= adj(prop2) * ci*q*Amu - * Cshift(prop1, mu, 1) * ci*q*Anu0; - vpTensor[mu][nu] += Cshift(adj(prop2), mu, 1) * (-ci)*q*Amu - * prop1 * ci*q*Anu0; + tmp_vp = Cshift(adj(prop2), mu, 1) * (-ci)*q*Amu + * prop1 * ci*q*Anu0; + tmp_vp -= adj(prop2) * ci*q*Amu + * Cshift(prop1, mu, 1) * ci*q*Anu0; + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; - // Sunset+tadpole from source - prop1 = q*q*(propSun + propTad); + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_alpha2_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } + + // Sunset from unshifted source + prop1 = q*q*propSun; prop2 = Cshift(*prop0_, nu, -1); - vpTensor[mu][nu] += adj(prop2) * Cshift(prop1, mu, 1); - vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * prop1; + tmp_vp = adj(prop2) * Cshift(prop1, mu, 1); + tmp_vp -= Cshift(adj(prop2), mu, 1) * prop1; + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; - // Sunset+tadpole from shifted source + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_sunset_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } + + // Sunset from shifted source 
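+            // Same sunset contribution with the roles of the shift exchanged:
+            // above, the free line carried the -nu shift (prop2); here the
+            // O(q^2) sunset line is shifted instead and contracted against
+            // the unshifted free propagator prop0_.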
prop1 = Cshift(prop1, nu, -1); - vpTensor[mu][nu] += Cshift(adj(*prop0_), mu, 1) * prop1; - vpTensor[mu][nu] -= adj(*prop0_) * Cshift(prop1, mu, 1); + tmp_vp = Cshift(adj(*prop0_), mu, 1) * prop1; + tmp_vp -= adj(*prop0_) * Cshift(prop1, mu, 1); + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_sunset_shifted_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } + + // Tadpole from unshifted source + prop1 = q*q*propTad; + prop2 = Cshift(*prop0_, nu, -1); + tmp_vp = adj(prop2) * Cshift(prop1, mu, 1); + tmp_vp -= Cshift(adj(prop2), mu, 1) * prop1; + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_tadpole_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } + + // Tadpole from shifted source + prop1 = Cshift(prop1, nu, -1); + tmp_vp = Cshift(adj(*prop0_), mu, 1) * prop1; + tmp_vp -= adj(*prop0_) * Cshift(prop1, mu, 1); + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_tadpole_shifted_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } // Source tadpole prop1 = *prop0_; - vpTensor[mu][nu] += adj(prop2) - * Cshift(prop1, mu, 1) - * (-0.5)*q*q*Anu0*Anu0; - vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) - * prop1 - * (-0.5)*q*q*Anu0*Anu0; + tmp_vp = adj(prop2) + * Cshift(prop1, mu, 1) + * (-0.5)*q*q*Anu0*Anu0; + tmp_vp -= Cshift(adj(prop2), mu, 1) + * prop1 + * (-0.5)*q*q*Anu0*Anu0; + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; + + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_sourcetadpole_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } // Sink tadpole - vpTensor[mu][nu] += adj(prop2) - * (-0.5)*q*q*Amu*Amu - * Cshift(prop1, mu, 1); - vpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) - * (-0.5)*q*q*Amu*Amu - * prop1; + tmp_vp = adj(prop2) + * (-0.5)*q*q*Amu*Amu + * Cshift(prop1, mu, 1); + tmp_vp -= Cshift(adj(prop2), mu, 1) + * (-0.5)*q*q*Amu*Amu + * prop1; + tmp_vp = 2.0*real(tmp_vp); + vpTensor[mu][nu] += tmp_vp; - freeVpTensor[mu][nu] = 2.0*real(freeVpTensor[mu][nu]); - vpTensor[mu][nu] = 2.0*real(vpTensor[mu][nu]); - } - } + // Output if necessary + if (!par().output.empty()) + { + sliceSum(tmp_vp, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD, + "Pi_sinktadpole_"+std::to_string(mu)+"_"+std::to_string(nu), + result); + } - // OUTPUT IF NECESSARY - if (!par().output.empty()) - { - std::string filename = par().output + "." + - std::to_string(env().getTrajectory()); - - LOG(Message) << "Saving zero-momentum projection to '" - << filename << "'..." 
<< std::endl; - - CorrWriter writer(filename); - std::vector vecBuf; - std::vector result; - - write(writer, "charge", q); - write(writer, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); - - for (unsigned int mu = 0; mu < env().getNd(); ++mu) - { - for (unsigned int nu = 0; nu < env().getNd(); ++nu) + // Output if necessary + if (!par().output.empty()) { sliceSum(vpTensor[mu][nu], vecBuf, Tp); result.resize(vecBuf.size()); @@ -234,21 +393,14 @@ void TScalarVP::execute(void) { result[t] = TensorRemove(vecBuf[t]); } - write(writer, "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), - result); - - sliceSum(freeVpTensor[mu][nu], vecBuf, Tp); - result.resize(vecBuf.size()); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); - } - write(writer, - "Pi_"+std::to_string(mu)+"_"+std::to_string(nu)+"_free", + write(*writer, "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), result); } } } + delete writer; + delete writer0; + delete writerD; } void TScalarVP::momD1(ScalarField &s, FFT &fft) From 3bfd1f13e67735d2273127eefabafb779c00996d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 11 Jun 2017 23:14:10 +0100 Subject: [PATCH 052/170] I/O improvements --- benchmarks/Benchmark_memory_bandwidth.cc | 4 +- benchmarks/Benchmark_su3.cc | 4 +- configure.ac | 2 +- extras/Hadrons/Modules/MGauge/Load.cc | 4 +- lib/Grid.h | 1 + lib/GridStd.h | 1 + lib/cartesian/Cartesian_base.h | 9 +- lib/cartesian/Cartesian_full.h | 8 +- lib/cartesian/Cartesian_red_black.h | 4 + lib/communicator/Communicator_base.h | 2 + lib/communicator/Communicator_mpi.cc | 8 + lib/communicator/Communicator_mpi3.cc | 8 + lib/communicator/Communicator_none.cc | 2 + lib/parallelIO/BinaryIO.h | 249 ++++++--- lib/parallelIO/IldgIO.h | 472 ++++++++++++------ lib/parallelIO/IldgIOtypes.h | 110 ++-- lib/parallelIO/NerscIO.h | 301 +++-------- .../hmc/checkpointers/BinaryCheckpointer.h | 38 +- lib/qcd/hmc/checkpointers/ILDGCheckpointer.h | 32 +- lib/qcd/hmc/checkpointers/NerscCheckpointer.h | 2 +- lib/qcd/utils/Utils.h | 3 - lib/serialisation/XmlIO.cc | 58 ++- lib/serialisation/XmlIO.h | 11 +- tests/IO/Test_nersc_io.cc | 4 +- tests/IO/Test_nersc_read.cc | 2 +- tests/IO/Test_serialisation.cc | 19 +- 26 files changed, 779 insertions(+), 579 deletions(-) diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index d57c4df5..1aa088f8 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -55,8 +55,8 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 1321715a..3d7f9bc9 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -35,9 +35,9 @@ using namespace Grid::QCD; int main (int argc, char ** argv) { Grid_init(&argc,&argv); -#define LMAX (32) +#define LMAX (64) - int Nloop=200; + int Nloop=20; std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); diff --git a/configure.ac b/configure.ac index 62b7545b..2fc9dfec 100644 --- a/configure.ac +++ b/configure.ac @@ -27,7 +27,7 @@ AX_GXX_VERSION AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], [version of g++ that will compile the code]) -CXXFLAGS="-O3 $CXXFLAGS" +CXXFLAGS="-g $CXXFLAGS" ############### Checks for typedefs, structures, and compiler characteristics diff --git a/extras/Hadrons/Modules/MGauge/Load.cc 
b/extras/Hadrons/Modules/MGauge/Load.cc index e5ee8abb..062e7e98 100644 --- a/extras/Hadrons/Modules/MGauge/Load.cc +++ b/extras/Hadrons/Modules/MGauge/Load.cc @@ -65,7 +65,7 @@ void TLoad::setup(void) // execution /////////////////////////////////////////////////////////////////// void TLoad::execute(void) { - NerscField header; + FieldMetaData header; std::string fileName = par().file + "." + std::to_string(env().getTrajectory()); @@ -74,5 +74,5 @@ void TLoad::execute(void) LatticeGaugeField &U = *env().createLattice(getName()); NerscIO::readConfiguration(U, header, fileName); LOG(Message) << "NERSC header:" << std::endl; - dump_nersc_header(header, LOG(Message)); + dump_meta_data(header, LOG(Message)); } diff --git a/lib/Grid.h b/lib/Grid.h index 543b0330..ce16894f 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -42,6 +42,7 @@ Author: paboyle #include #include #include +#include #include #endif diff --git a/lib/GridStd.h b/lib/GridStd.h index fb5e5b21..959ba9ac 100644 --- a/lib/GridStd.h +++ b/lib/GridStd.h @@ -18,6 +18,7 @@ #include #include #include +#include /////////////////// // Grid config diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index b31b3b5f..0db6ce0d 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -50,7 +50,6 @@ public: GridBase(const std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; - // Physics Grid information. std::vector _simd_layout;// Which dimensions get relayed out over simd lanes. std::vector _fdimensions;// (full) Global dimensions of array prior to cb removal @@ -63,13 +62,12 @@ public: int _isites; int _fsites; // _isites*_osites = product(dimensions). int _gsites; - std::vector _slice_block; // subslice information + std::vector _slice_block;// subslice information std::vector _slice_stride; std::vector _slice_nblock; - // Might need these at some point - // std::vector _lstart; // local start of array in gcoors. 
_processor_coor[d]*_ldimensions[d] - // std::vector _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 + std::vector _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] + std::vector _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 public: @@ -176,6 +174,7 @@ public: inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; inline int Nd (void) const { return _ndimension;}; + inline const std::vector LocalStarts(void) { return _lstart; }; inline const std::vector &FullDimensions(void) { return _fdimensions;}; inline const std::vector &GlobalDimensions(void) { return _gdimensions;}; inline const std::vector &LocalDimensions(void) { return _ldimensions;}; diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index 7e29d311..b0e47fa4 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -76,6 +76,8 @@ public: _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); _simd_layout.resize(_ndimension); + _lstart.resize(_ndimension); + _lend.resize(_ndimension); _ostride.resize(_ndimension); _istride.resize(_ndimension); @@ -94,8 +96,10 @@ public: // Use a reduced simd grid _ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition - _osites *= _rdimensions[d]; - _isites *= _simd_layout[d]; + _lstart[d] = _processor_coor[d]*_ldimensions[d]; + _lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; + _osites *= _rdimensions[d]; + _isites *= _simd_layout[d]; // Addressing support if ( d==0 ) { diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index 2f132c19..3037de00 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -151,6 +151,8 @@ public: _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); _simd_layout.resize(_ndimension); + _lstart.resize(_ndimension); + _lend.resize(_ndimension); _ostride.resize(_ndimension); _istride.resize(_ndimension); @@ -169,6 +171,8 @@ public: _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard } _ldimensions[d] = _gdimensions[d]/_processors[d]; + _lstart[d] = _processor_coor[d]*_ldimensions[d]; + _lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; // Use a reduced simd grid _simd_layout[d] = simd_layout[d]; diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 23d4f647..12a8429f 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -177,6 +177,8 @@ class CartesianCommunicator { void GlobalSumVector(ComplexF *c,int N); void GlobalSum(ComplexD &c); void GlobalSumVector(ComplexD *c,int N); + void GlobalXOR(uint32_t &); + void GlobalXOR(uint64_t &); template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 470a06c7..bd2a62fb 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -83,6 +83,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalXOR(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint64_t &u){ + 
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); + assert(ierr==0); +} void CartesianCommunicator::GlobalSum(float &f){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 54a0f9b5..632eb991 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -510,6 +510,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalXOR(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint64_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); + assert(ierr==0); +} void CartesianCommunicator::GlobalSum(float &f){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index ace2868b..5319ab93 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -59,6 +59,8 @@ void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(uint32_t &){} void CartesianCommunicator::GlobalSum(uint64_t &){} void CartesianCommunicator::GlobalSumVector(double *,int N){} +void CartesianCommunicator::GlobalXOR(uint32_t &){} +void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::SendRecvPacket(void *xmit, void *recv, diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 8b8d4165..bc3da38b 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -125,57 +125,94 @@ class BinaryIO { ///////////////////////////////////////////////////////////////////////////// // more byte manipulation helpers ///////////////////////////////////////////////////////////////////////////// - static inline void Uint32Checksum(uint32_t *buf,uint64_t buf_size_bytes,uint32_t &csum) + + template static inline void Uint32Checksum(Lattice &lat, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) + { + typedef typename vobj::scalar_object sobj; + + GridBase *grid = lat._grid; + int lsites = grid->lSites(); + + std::vector scalardata(lsites); + unvectorizeToLexOrdArray(scalardata,lat); + + Uint32Checksum(grid,scalardata,nersc_csum,scidac_csuma,scidac_csumb); + } + + template + static inline void Uint32Checksum(GridBase *grid, + std::vector &fbuf, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) + { + const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); + + + int nd = grid->_ndimension; + + uint64_t lsites =grid->lSites(); + std::vector local_vol =grid->LocalDimensions(); + std::vector local_start =grid->LocalStarts(); + std::vector global_vol =grid->FullDimensions(); + #pragma omp parallel { - uint32_t csum_thr=0; - uint64_t count = buf_size_bytes/sizeof(uint32_t); + std::vector coor(nd); + uint32_t nersc_csum_thr=0; + uint32_t scidac_csuma_thr=0; + uint32_t scidac_csumb_thr=0; + uint32_t site_crc=0; + uint32_t zcrc = crc32(0L, Z_NULL, 0); + #pragma omp for - for(uint64_t i=0;i>(32-gsite29); + scidac_csumb_thr ^= site_crc<>(32-gsite31); } + #pragma omp critical - csum = csum + csum_thr; + { + nersc_csum += nersc_csum_thr; + scidac_csuma^= scidac_csuma_thr; + scidac_csumb^= scidac_csumb_thr; + 
} } } + // Network is big endian - static inline void htobe32_v(void *file_object,uint64_t bytes,uint32_t &csum){ - Uint32Checksum((uint32_t *)file_object,bytes,csum); - htobe32_v(file_object,bytes); - } - static inline void htobe64_v(void *file_object,uint64_t bytes,uint32_t &csum){ - Uint32Checksum((uint32_t *)file_object,bytes,csum); - htobe64_v(file_object,bytes); - } - static inline void htole32_v(void *file_object,uint64_t bytes,uint32_t &csum){ - Uint32Checksum((uint32_t *)file_object,bytes,csum); - htole32_v(file_object,bytes); - } - static inline void htole64_v(void *file_object,uint64_t bytes,uint32_t &csum){ - Uint32Checksum((uint32_t *)file_object,bytes,csum); - htole64_v(file_object,bytes); - } - static inline void be32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ - be32toh_v(file_object,bytes); - Uint32Checksum((uint32_t *)file_object,bytes,csum); - } - static inline void be64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ - be64toh_v(file_object,bytes); - Uint32Checksum((uint32_t *)file_object,bytes,csum); - } - static inline void le32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ - le32toh_v(file_object,bytes); - Uint32Checksum((uint32_t *)file_object,bytes,csum); - } - static inline void le64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ - le64toh_v(file_object,bytes); - Uint32Checksum((uint32_t *)file_object,bytes,csum); - } - static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} - static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} - static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} - static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} + static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);} + static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);} + static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);} + static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);} static inline void be32toh_v(void *file_object,uint64_t bytes) { @@ -199,6 +236,7 @@ class BinaryIO { fp[i] = ntohl(f); } } + // BE is same as network static inline void be64toh_v(void *file_object,uint64_t bytes) { @@ -238,18 +276,23 @@ class BinaryIO { static const int BINARYIO_WRITE = 0x01; template - static inline uint32_t IOobject(word w, - GridBase *grid, - std::vector &iodata, - std::string file, - int offset, - const std::string &format, int control) + static inline void IOobject(word w, + GridBase *grid, + std::vector &iodata, + std::string file, + int offset, + const std::string &format, int control, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) { grid->Barrier(); GridStopWatch timer; GridStopWatch bstimer; - uint32_t csum=0; + nersc_csum=0; + scidac_csuma=0; + scidac_csumb=0; int ndim = grid->Dimensions(); int nrank = grid->ProcessorCount(); @@ -359,20 +402,22 @@ class BinaryIO { grid->Barrier(); bstimer.Start(); - if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee32) 
le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb); bstimer.Stop(); } if ( control & BINARYIO_WRITE ) { bstimer.Start(); - if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); - if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum); + Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb); + if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); bstimer.Stop(); grid->Barrier(); @@ -418,17 +463,27 @@ class BinaryIO { // Safety check ////////////////////////////////////////////////////////////////////////////// grid->Barrier(); - grid->GlobalSum(csum); + grid->GlobalSum(nersc_csum); + grid->GlobalXOR(scidac_csuma); + grid->GlobalXOR(scidac_csumb); grid->Barrier(); - - return csum; + // std::cout << "Binary IO NERSC checksum 0x"< - static inline uint32_t readLatticeObject(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) + static inline void readLatticeObject(Lattice &Umu, + std::string file, + munger munge, + int offset, + const std::string &format, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) { typedef typename vobj::scalar_object sobj; typedef typename vobj::Realified::scalar_type word; word w=0; @@ -439,7 +494,8 @@ class BinaryIO { std::vector scalardata(lsites); std::vector iodata(lsites); // Munge, checksum, byte order in here - uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC); + IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC, + nersc_csum,scidac_csuma,scidac_csumb); GridStopWatch timer; timer.Start(); @@ -451,15 +507,20 @@ class BinaryIO { timer.Stop(); std::cout< - static inline uint32_t writeLatticeObject(Lattice &Umu,std::string file,munger munge,int offset,const std::string &format) + static inline void writeLatticeObject(Lattice &Umu, + std::string file, + munger munge, + int offset, + const std::string &format, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) { typedef typename vobj::scalar_object sobj; typedef typename vobj::Realified::scalar_type word; word w=0; @@ -480,36 +541,45 @@ class BinaryIO { grid->Barrier(); timer.Stop(); - uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC); + IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, + nersc_csum,scidac_csuma,scidac_csumb); std::cout< RNGstate; typedef RngStateType word; word w=0; - uint32_t csum = 0; std::string format = "IEEE32BIG"; GridBase *grid = parallel._grid; int gsites = grid->gSites(); int lsites = grid->lSites(); + uint32_t nersc_csum_tmp; + uint32_t scidac_csuma_tmp; + uint32_t scidac_csumb_tmp; + GridStopWatch timer; std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl; std::vector iodata(lsites); - csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC); 
+ IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC, + nersc_csum,scidac_csuma,scidac_csumb); timer.Start(); parallel_for(int lidx=0;lidx tmp(RngStateCount); std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin()); serial.SetState(tmp,0); } - std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; + nersc_csum = nersc_csum + nersc_csum_tmp; + scidac_csuma = scidac_csuma ^ scidac_csuma_tmp; + scidac_csumb = scidac_csumb ^ scidac_csumb_tmp; + + // std::cout << GridLogMessage << "RNG file nersc_checksum " << std::hex << nersc_csum << std::dec << std::endl; + // std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl; + // std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl; + std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; - return csum; } ///////////////////////////////////////////////////////////////////////////// // Write a RNG; lexico map to an array of state and use IOobject ////////////////////////////////////////////////////////////////////////////////////// - static inline uint32_t writeRNG(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset) + static inline void writeRNG(GridSerialRNG &serial, + GridParallelRNG &parallel, + std::string file, + int offset, + uint32_t &nersc_csum, + uint32_t &scidac_csuma, + uint32_t &scidac_csumb) { typedef typename GridSerialRNG::RngStateType RngStateType; typedef RngStateType word; word w=0; const int RngStateCount = GridSerialRNG::RngStateCount; typedef std::array RNGstate; - uint32_t csum = 0; - GridBase *grid = parallel._grid; int gsites = grid->gSites(); int lsites = grid->lSites(); + uint32_t nersc_csum_tmp; + uint32_t scidac_csuma_tmp; + uint32_t scidac_csumb_tmp; + GridStopWatch timer; std::string format = "IEEE32BIG"; @@ -561,7 +647,8 @@ class BinaryIO { } timer.Stop(); - csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC); + IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, + nersc_csum,scidac_csuma,scidac_csumb); iodata.resize(1); { @@ -569,11 +656,11 @@ class BinaryIO { serial.GetState(tmp,0); std::copy(tmp.begin(),tmp.end(),iodata[0].begin()); } - csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND); + IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND, + nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp); - std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; + // std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl; std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; - return csum; } }; } diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 0912e2f6..237edf43 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -43,201 +43,351 @@ extern "C" { // for linkage #include "lime.h" } + +// Unused SCIDAC record names +// SCIDAC_PRIVATE_FILE_XML "scidac-private-file-xml" +// SCIDAC_SITELIST "scidac-sitelist" +// SCIDAC_FILE_XML "scidac-file-xml" +// SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml" +// SCIDAC_RECORD_XML "scidac-record-xml" +// SCIDAC_BINARY_DATA "scidac-binary-data" +// +// Scidac checksum: CRC32 every site; rotate each site CRC by (global site index mod 29 / mod 31) and xor-reduce over all sites. 
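
Stripped of the lattice decomposition, the OpenMP threading and the MPI reduction, the SciDAC convention above reduces to a short serial routine. The sketch below is illustrative only: the helper name and the firstGlobalSite argument are inventions for the example, and it assumes zlib's crc32 over sites already in lexicographic order, whereas the kernel above reconstructs the global site index from the local coordinate and the rank's local_start offsets. The QIO reference implementation is linked in the comment that follows.

    #include <zlib.h>     // crc32, Z_NULL
    #include <cstdint>
    #include <vector>

    // Serial sketch of the SciDAC checksum pair: CRC32 each site's payload,
    // rotate the site CRC left by (global site index mod 29) and (mod 31),
    // and xor-reduce the rotated values over all sites.
    template <class SiteObj>
    void scidacChecksumSketch(const std::vector<SiteObj> &sites,
                              uint64_t firstGlobalSite,   // hypothetical: global index of sites[0]
                              uint32_t &suma, uint32_t &sumb)
    {
      suma = sumb = 0;
      const uint32_t zcrc = crc32(0L, Z_NULL, 0);    // zlib CRC32 seed
      for (uint64_t i = 0; i < sites.size(); i++) {
        uint64_t gsite = firstGlobalSite + i;        // lexicographic global site index
        uint32_t crc   = crc32(zcrc, (const Bytef *)&sites[i], sizeof(SiteObj));
        uint32_t r29 = gsite % 29, r31 = gsite % 31;
        suma ^= r29 ? ((crc << r29) | (crc >> (32 - r29))) : crc;  // rotl32, guarding r==0
        sumb ^= r31 ? ((crc << r31) | (crc >> (32 - r31))) : crc;  // since crc >> 32 is undefined
      }
    }

Rotating each site CRC by its global index keeps the pair sensitive to site permutations, while xor makes the reduction order independent; that is why the per-thread partials above combine with ^= and the cross-rank combine uses the new GlobalXOR (MPI_BXOR), whereas the plain 32-bit NERSC sum still reduces with GlobalSum.
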
+// https://github.com/usqcd-software/qio/blob/master/lib/dml/DML_utils.c + namespace Grid { namespace QCD { -inline void ILDGGrid(GridBase *grid, ILDGField &header) { - assert(grid->_ndimension == 4); // emit error if not - header.dimension.resize(4); - header.boundary.resize(4); - for (int d = 0; d < 4; d++) { - header.dimension[d] = grid->_fdimensions[d]; - // Read boundary conditions from ... ? - header.boundary[d] = std::string("periodic"); - } -} - -inline void ILDGChecksum(uint32_t *buf, uint32_t buf_size_bytes, - uint32_t &csum) { - BinaryIO::Uint32Checksum(buf, buf_size_bytes, csum); -} - -////////////////////////////////////////////////////////////////////// -// Utilities ; these are QCD aware -////////////////////////////////////////////////////////////////////// -template -inline void ILDGStatistics(GaugeField &data, ILDGField &header) { - // How to convert data precision etc... - header.link_trace = Grid::QCD::WilsonLoops::linkTrace(data); - header.plaquette = Grid::QCD::WilsonLoops::avgPlaquette(data); - // header.polyakov = -} - -// Forcing QCD here -template -struct ILDGMunger { - void operator()(fobj &in, sobj &out, uint32_t &csum) { - for (int mu = 0; mu < 4; mu++) { - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 3; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - } - } - } - ILDGChecksum((uint32_t *)&in, sizeof(in), csum); - }; -}; - -template -struct ILDGUnmunger { - void operator()(sobj &in, fobj &out, uint32_t &csum) { - for (int mu = 0; mu < 4; mu++) { - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 3; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - } - } - } - ILDGChecksum((uint32_t *)&out, sizeof(out), csum); - }; -}; - -//////////////////////////////////////////////////////////////////////////////// -// Write and read from fstream; compute header offset for payload -//////////////////////////////////////////////////////////////////////////////// -enum ILDGstate {ILDGread, ILDGwrite}; - -class ILDGIO : public BinaryIO { - FILE *File; - LimeWriter *LimeW; - LimeRecordHeader *LimeHeader; - LimeReader *LimeR; - std::string filename; - - +class IldgIO : public BinaryIO { public: - ILDGIO(std::string file, ILDGstate RW) { - filename = file; - if (RW == ILDGwrite){ - File = fopen(file.c_str(), "w"); - // check if opened correctly - LimeW = limeCreateWriter(File); - } else { - File = fopen(file.c_str(), "r"); - // check if opened correctly - - LimeR = limeCreateReader(File); - } - } - - ~ILDGIO() { fclose(File); } - - int createHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L){ + static int createHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L) + { LimeRecordHeader *h; h = limeCreateHeader(MB, ME, const_cast(message.c_str()), PayloadSize); - int status = limeWriteRecordHeader(h, L); - if (status < 0) { - std::cerr << "ILDG Header error\n"; - return status; - } + assert(limeWriteRecordHeader(h, L) >= 0); limeDestroyHeader(h); return LIME_SUCCESS; } - unsigned int writeHeader(ILDGField &header) { - // write header in LIME - n_uint64_t nbytes; - int MB_flag = 1, ME_flag = 0; - - char message[] = "ildg-format"; - nbytes = strlen(message); - LimeHeader = limeCreateHeader(MB_flag, ME_flag, message, nbytes); - limeWriteRecordHeader(LimeHeader, LimeW); - limeDestroyHeader(LimeHeader); - // save the xml header here - // use the xml_writer to c++ streams in pugixml - // and convert to char message - limeWriteRecordData(message, &nbytes, LimeW); + template + static void writeLimeObject(int MB,int ME,serialisable_object 
&object,std::string object_name,std::string record_name, LimeWriter *LimeW) + { + std::string xmlstring; + { + XmlWriter WR("",""); + write(WR,object_name,object); + xmlstring = WR.XmlString(); + } + uint64_t nbytes = xmlstring.size(); + LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); + assert(limeWriteRecordHeader(h, LimeW)>=0); + assert(limeWriteRecordData(&xmlstring[0], &nbytes, LimeW)>=0); limeWriterCloseRecord(LimeW); - - return 0; + limeDestroyHeader(h); } - unsigned int readHeader(ILDGField &header) { + static unsigned int writeHeader(FieldMetaData &header, LimeWriter *LimeW) { + + uint64_t nbytes; + + ildgFormat ildgfmt ; + usqcdInfo info; + + ////////////////////////////////////////////////////// + // Fill ILDG header data struct + ////////////////////////////////////////////////////// + ildgfmt.field = std::string("su3gauge"); + ildgfmt.precision = 64; + ildgfmt.version = 1.0; + ildgfmt.lx = header.dimension[0]; + ildgfmt.ly = header.dimension[1]; + ildgfmt.lz = header.dimension[2]; + ildgfmt.lt = header.dimension[3]; + assert(header.nd==4); + assert(header.nd==header.dimension.size()); + + info.version=1.0; + info.plaq = header.plaquette; + info.linktr = header.link_trace; + + // Following scidac file downloaded from NERSC under MILC + // Begin message, keep open on successive records + //Message 1 + // Type: scidac-private-file-xml 1.1416 16 16 48 0 + // Type: scidac-file-xml MILC ILDG archival gauge configuration + //Message 2 + // Type: scidac-private-record-xml 1.0Thu May 11 00:11:33 2006 UTC0 + // QDP_F3_ColorMatrixF3724 + // Type: scidac-record-xml + // Type: ildg-format + // Type: ildg-data-lfn + // Type: ildg-binary-data + // Type: scidac-checksum + + writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); + writeLimeObject(0,0,info ,std::string("usqcdInfo" ),std::string(USQCD_INFO ),LimeW); + writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT),LimeW); + // LFN is not a serializable object + { + std::string LFN = header.ildg_lfn; + uint64_t PayloadSize = LFN.size(); + createHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW); + limeWriteRecordData(const_cast(LFN.c_str()), &PayloadSize, LimeW); + limeWriterCloseRecord(LimeW); + } return 0; } template - uint32_t readConfiguration(Lattice > &Umu) { - typedef Lattice > GaugeField; - typedef LorentzColourMatrixD sobjd; - typedef LorentzColourMatrixF sobjf; - typedef iLorentzColourMatrix itype; - typedef LorentzColourMatrix sobj; - GridBase *grid = Umu._grid; + static void writeConfiguration(std::string filename,Lattice > &Umu, std::string format) { - ILDGField header; - readHeader(header); + FILE *File = fopen(filename.c_str(), "w"); + LimeWriter *LimeW = limeCreateWriter(File); - // now just the conf, ignore the header - std::string format = std::string("IEEE64BIG"); - do {limeReaderNextRecord(LimeR);} - while (strncmp(limeReaderType(LimeR), "ildg-binary-data",16)); - - n_uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration) - - - ILDGtype ILDGt(true, LimeR); - // this is special for double prec data, just for the moment - uint32_t csum = BinaryIO::readObjectParallel< itype, sobjd >( - Umu, filename, ILDGMunger(), 0, format, ILDGt); - - // Check configuration - // todo - - return csum; - } - - template - uint32_t writeConfiguration(Lattice > &Umu, std::string format) { typedef Lattice > GaugeField; typedef iLorentzColourMatrix vobj; typedef typename vobj::scalar_object sobj; typedef LorentzColourMatrixD 
 fobj; - ILDGField header; - // fill the header + GridBase * grid = Umu._grid; + + //////////////////////////////////////// + // fill the headers + //////////////////////////////////////// + FieldMetaData header; + + GridMetaData(grid,header); + GaugeStatistics(Umu,header); + MachineCharacteristics(header); + + assert( (format=="IEEE64BIG") || (format=="IEEE32BIG")); header.floating_point = format; + header.checksum = 0x0; // unused in ILDG + writeHeader(header,LimeW); - ILDGUnmunger munge; - unsigned int offset = writeHeader(header); - - BinaryIO::Uint32Checksum(Umu, munge, header.checksum); - + //////////////////////////////////////// // Write data record header - n_uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites; - createHeader("ildg-binary-data", 0, 1, PayloadSize, LimeW); - - ILDGtype ILDGt(true, LimeW); - uint32_t csum = BinaryIO::writeObjectParallel( - Umu, filename, munge, 0, header.floating_point, ILDGt); - + //////////////////////////////////////// + uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites; + createHeader(ILDG_BINARY_DATA, 0, 0, PayloadSize, LimeW); + + off_t offset = ftell(File); + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + GaugeSimpleMunger munge; + BinaryIO::writeLatticeObject(Umu, filename, munge, offset, header.floating_point, + nersc_csum,scidac_csumb,scidac_csumb); limeWriterCloseRecord(LimeW); - // Last record - // the logical file name LNF - // look into documentation on how to generate this string - std::string LNF = "empty"; + //////////////////////////////////////// + // Write checksum element, propagating forward from the BinaryIO + //////////////////////////////////////// + scidacChecksum checksum; + checksum.suma= scidac_csuma; + checksum.sumb= scidac_csumb; + // std::cout << " writing scidac checksums " << std::hex << scidac_csuma << "/" << scidac_csumb << std::dec << std::endl; + writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM),LimeW); + fclose(File); + } + + template + static void readConfiguration(std::string filename,Lattice > &Umu, FieldMetaData &FieldMetaData_) { + + typedef Lattice > GaugeField; + typedef LorentzColourMatrixD sobjd; + typedef LorentzColourMatrixF sobjf; + typedef iLorentzColourMatrix itype; + typedef LorentzColourMatrix sobj; + + GridBase *grid = Umu._grid; + + std::vector dims = Umu._grid->FullDimensions(); + assert(dims.size()==4); + + FILE *File = fopen(filename.c_str(), "r"); + LimeReader *LimeR = limeCreateReader(File); - PayloadSize = sizeof(LNF); - createHeader("ildg-binary-lfn", 1 , 1, PayloadSize, LimeW); - limeWriteRecordData(const_cast(LNF.c_str()), &PayloadSize, LimeW); + // Metadata holders + ildgFormat ildgFormat_ ; + std::string ildgLFN_ ; + scidacChecksum scidacChecksum_; + usqcdInfo usqcdInfo_ ; - limeWriterCloseRecord(LimeW); + // track what we read from file + int found_ildgFormat =0; + int found_ildgLFN =0; + int found_scidacChecksum=0; + int found_usqcdInfo =0; + int found_ildgBinary =0; + int found_FieldMetaData =0; - return csum; + uint32_t nersc_csum; + uint32_t scidac_csuma; + uint32_t scidac_csumb; + + // Binary format + std::string format; + + ////////////////////////////////////////////////////////////////////////// + // Loop over all records + // -- Order is poorly guaranteed except ILDG header precedes binary section. + // -- Run like an event loop. + // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing + // that Scidac. + // -- Insist on Scidac checksum record. 
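
Before the loop itself, which resumes just below, it may help to see the bare LIME traversal in isolation. The following stand-alone scanner is a sketch, not part of the patch: it walks every record with the same c-lime calls, prints type and size, and dumps the payload of string records, assuming the usual c-lime behaviour that limeReaderNextRecord seeks past any unread payload.

    #include <cstdio>
    #include <cstring>
    #include <vector>
    extern "C" {
    #include "lime.h"
    }

    // Walk every LIME record in a file: print its type and size, and dump the
    // payload of string (XML) records. Binary payloads are left unread.
    int main(int argc, char *argv[])
    {
      if (argc < 2) { fprintf(stderr, "usage: %s <lime-file>\n", argv[0]); return 1; }
      FILE *fp = fopen(argv[1], "r");
      if (!fp) return 1;
      LimeReader *reader = limeCreateReader(fp);
      while (limeReaderNextRecord(reader) == LIME_SUCCESS) {
        n_uint64_t nbytes = limeReaderBytes(reader);
        printf("record %-24s %llu bytes\n", limeReaderType(reader),
               (unsigned long long)nbytes);
        if (strncmp(limeReaderType(reader), "ildg-binary-data", 16) != 0) {
          std::vector<char> xml(nbytes + 1, '\0');   // NUL-terminate for printing
          limeReaderReadData((void *)&xml[0], &nbytes, reader);
          printf("%s\n", &xml[0]);
        }
      }
      limeDestroyReader(reader);
      fclose(fp);
      return 0;
    }

The event loop below performs the same walk, but parses each string record into the matching serialisable struct and applies the trust hierarchy spelled out in the comment above.
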
+ ////////////////////////////////////////////////////////////////////////// + + while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { + + uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration) + + ////////////////////////////////////////////////////////////////// + // If not BINARY_DATA read a string and parse + ////////////////////////////////////////////////////////////////// + if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) ) ) { + + // Copy out the string + std::vector xmlc(nbytes+1,'\0'); + limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); + std::cout << GridLogMessage<< "Non binary record :" < munge; + BinaryIO::readLatticeObject< itype, sobjd >(Umu, filename, munge, offset, format, + nersc_csum,scidac_csuma,scidac_csumb); + found_ildgBinary = 1; + } + + } + + ////////////////////////////////////////////////////// + // Minimally must find binary segment and checksum + ////////////////////////////////////////////////////// + assert(found_ildgBinary); + assert(found_scidacChecksum); + + // Must find something with the lattice dimensions + assert(found_FieldMetaData||found_ildgFormat); + + if ( found_FieldMetaData ) { + + std::cout << GridLogMessage<<"a Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<(Umu,checker); + assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); + assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); + std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; + } } // format for RNG? Now just binary out diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h index 4c7a1edd..8e1316eb 100644 --- a/lib/parallelIO/IldgIOtypes.h +++ b/lib/parallelIO/IldgIOtypes.h @@ -34,47 +34,83 @@ extern "C" { // for linkage namespace Grid { -struct ILDGtype { - bool is_ILDG; - LimeWriter* LW; - LimeReader* LR; +#define GRID_FORMAT "grid-format" +#define ILDG_FORMAT "ildg-format" +#define ILDG_BINARY_DATA "ildg-binary-data" +#define ILDG_DATA_LFN "ildg-data-lfn" +#define USQCD_INFO "usqcdInfo" +#define SCIDAC_CHECKSUM "scidac-checksum" - ILDGtype(bool is, LimeWriter* L) : is_ILDG(is), LW(L), LR(NULL) {} - ILDGtype(bool is, LimeReader* L) : is_ILDG(is), LW(NULL), LR(L) {} - ILDGtype() : is_ILDG(false), LW(NULL), LR(NULL) {} +///////////////////////////////////////////////////////////////////////////////// +// Data representation of records that enter ILDG and SciDac formats +///////////////////////////////////////////////////////////////////////////////// +struct ildgFormat : Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat, + double, version, + std::string, field, + int, precision, + int, lx, + int, ly, + int, lz, + int, lt); + ildgFormat() { + version=1.0; + }; }; - -class ILDGField { +struct usqcdInfo : Serializable { public: - // header strings (not in order) - std::vector dimension; - std::vector boundary; - int data_start; - std::string hdr_version; - std::string storage_format; - // Checks on data - double link_trace; - double plaquette; - uint32_t checksum; - unsigned int sequence_number; - std::string data_type; - std::string ensemble_id; - std::string ensemble_label; - std::string creator; - std::string creator_hardware; - std::string creation_date; - std::string archive_date; - std::string floating_point; + GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo, + double, version, + double, plaq, + double, linktr, + std::string, info); + usqcdInfo() { + version=1.0; + }; +}; + +struct usqcdPropFile : 
Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile, + double, version, + std::string, type, + std::string, info); + usqcdPropFile() { + version=1.0; + }; +}; +struct usqcdSourceInfo : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo, + double, version, + std::string, info); + usqcdSourceInfo() { + version=1.0; + }; +}; +struct usqcdPropInfo : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo, + double, version, + int, spin, + int, color, + std::string, info); + usqcdPropInfo() { + version=1.0; + }; +}; +struct scidacChecksum : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum, + double, version, + uint32_t, suma, + uint32_t, sumb); + scidacChecksum() { + version=1.0; + suma=sumb=0; + }; }; } -#else -namespace Grid { - -struct ILDGtype { - bool is_ILDG; - ILDGtype() : is_ILDG(false) {} -}; -} - #endif #endif diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index ba9d23de..cc37b537 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -30,168 +30,11 @@ #ifndef GRID_NERSC_IO_H #define GRID_NERSC_IO_H -#include -#include -#include -#include -#include - -#include -#include -#include - namespace Grid { namespace QCD { using namespace Grid; - //////////////////////////////////////////////////////////////////////////////// - // Some data types for intermediate storage - //////////////////////////////////////////////////////////////////////////////// - template using iLorentzColour2x3 = iVector, 2>, 4 >; - - typedef iLorentzColour2x3 LorentzColour2x3; - typedef iLorentzColour2x3 LorentzColour2x3F; - typedef iLorentzColour2x3 LorentzColour2x3D; - - //////////////////////////////////////////////////////////////////////////////// - // header specification/interpretation - //////////////////////////////////////////////////////////////////////////////// - class NerscField { - public: - // header strings (not in order) - int dimension[4]; - std::string boundary[4]; - int data_start; - std::string hdr_version; - std::string storage_format; - // Checks on data - double link_trace; - double plaquette; - uint32_t checksum; - unsigned int sequence_number; - std::string data_type; - std::string ensemble_id ; - std::string ensemble_label ; - std::string creator ; - std::string creator_hardware ; - std::string creation_date ; - std::string archive_date ; - std::string floating_point; - }; - - ////////////////////////////////////////////////////////////////////// - // Bit and Physical Checksumming and QA of data - ////////////////////////////////////////////////////////////////////// - - inline void NerscGrid(GridBase *grid,NerscField &header) - { - assert(grid->_ndimension==4); - for(int d=0;d<4;d++) { - header.dimension[d] = grid->_fdimensions[d]; - } - for(int d=0;d<4;d++) { - header.boundary[d] = std::string("PERIODIC"); - } - } - template - inline void NerscStatistics(GaugeField & data,NerscField &header) - { - // How to convert data precision etc... 
- header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); - header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); - } - - inline void NerscMachineCharacteristics(NerscField &header) - { - // Who - struct passwd *pw = getpwuid (getuid()); - if (pw) header.creator = std::string(pw->pw_name); - - // When - std::time_t t = std::time(nullptr); - std::tm tm = *std::localtime(&t); - std::ostringstream oss; - // oss << std::put_time(&tm, "%c %Z"); - header.creation_date = oss.str(); - header.archive_date = header.creation_date; - - // What - struct utsname name; uname(&name); - header.creator_hardware = std::string(name.nodename)+"-"; - header.creator_hardware+= std::string(name.machine)+"-"; - header.creator_hardware+= std::string(name.sysname)+"-"; - header.creator_hardware+= std::string(name.release); - - } - ////////////////////////////////////////////////////////////////////// - // Utilities ; these are QCD aware - ////////////////////////////////////////////////////////////////////// - inline void reconstruct3(LorentzColourMatrix & cm) - { - const int x=0; - const int y=1; - const int z=2; - for(int mu=0;mu<4;mu++){ - cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy - cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz - cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx - } - } - - template - struct NerscSimpleMunger{ - void operator()(fobj &in, sobj &out) { - for (int mu = 0; mu < Nd; mu++) { - for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - }} - } - }; - }; - - template - struct NerscSimpleUnmunger { - - void operator()(sobj &in, fobj &out) { - for (int mu = 0; mu < Nd; mu++) { - for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { - out(mu)()(i, j) = in(mu)()(i, j); - }} - } - }; - }; - - template - struct Nersc3x2munger{ - - void operator() (fobj &in,sobj &out){ - for(int mu=0;mu<4;mu++){ - for(int i=0;i<2;i++){ - for(int j=0;j<3;j++){ - out(mu)()(i,j) = in(mu)(i)(j); - }} - } - reconstruct3(out); - } - }; - - template - struct Nersc3x2unmunger{ - - void operator() (sobj &in,fobj &out){ - for(int mu=0;mu<4;mu++){ - for(int i=0;i<2;i++){ - for(int j=0;j<3;j++){ - out(mu)(i)(j) = in(mu)()(i,j); - }} - } - } - }; - - //////////////////////////////////////////////////////////////////////////////// // Write and read from fstream; comput header offset for payload //////////////////////////////////////////////////////////////////////////////// @@ -202,42 +45,17 @@ namespace Grid { std::ofstream fout(file,std::ios::out); } -#define dump_nersc_header(field, s) \ - s << "BEGIN_HEADER" << std::endl; \ - s << "HDR_VERSION = " << field.hdr_version << std::endl; \ - s << "DATATYPE = " << field.data_type << std::endl; \ - s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \ - for(int i=0;i<4;i++){ \ - s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \ - } \ - s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \ - s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \ - for(int i=0;i<4;i++){ \ - s << "BOUNDARY_"< header; @@ -309,19 +127,21 @@ namespace Grid { return field.data_start; } - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Now the meat: the object readers - 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Now the meat: the object readers + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template - static inline void readConfiguration(Lattice > &Umu,NerscField& header,std::string file) + static inline void readConfiguration(Lattice > &Umu, + FieldMetaData& header, + std::string file) { typedef Lattice > GaugeField; GridBase *grid = Umu._grid; int offset = readHeader(file,Umu._grid,header); - NerscField clone(header); + FieldMetaData clone(header); std::string format(header.floating_point); @@ -330,34 +150,38 @@ namespace Grid { int ieee64big = (format == std::string("IEEE64BIG")); int ieee64 = (format == std::string("IEEE64")); - uint32_t csum; + uint32_t nersc_csum,scidac_csuma,scidac_csumb; // depending on datatype, set up munger; // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( ieee32 || ieee32big ) { - csum=BinaryIO::readLatticeObject, LorentzColour2x3F> - (Umu,file,Nersc3x2munger(), offset,format); + BinaryIO::readLatticeObject, LorentzColour2x3F> + (Umu,file,Gauge3x2munger(), offset,format, + nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - csum=BinaryIO::readLatticeObject, LorentzColour2x3D> - (Umu,file,Nersc3x2munger(),offset,format); + BinaryIO::readLatticeObject, LorentzColour2x3D> + (Umu,file,Gauge3x2munger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); } } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { if ( ieee32 || ieee32big ) { - csum=BinaryIO::readLatticeObject,LorentzColourMatrixF> - (Umu,file,NerscSimpleMunger(),offset,format); + BinaryIO::readLatticeObject,LorentzColourMatrixF> + (Umu,file,GaugeSimpleMunger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - csum=BinaryIO::readLatticeObject,LorentzColourMatrixD> - (Umu,file,NerscSimpleMunger(),offset,format); + BinaryIO::readLatticeObject,LorentzColourMatrixD> + (Umu,file,GaugeSimpleMunger(),offset,format, + nersc_csum,scidac_csuma,scidac_csumb); } } else { assert(0); } - NerscStatistics(Umu,clone); + GaugeStatistics(Umu,clone); - std::cout< - static inline void writeConfiguration(Lattice > &Umu,std::string file, int two_row,int bits32) + static inline void writeConfiguration(Lattice > &Umu, + std::string file, + int two_row, + int bits32) { typedef Lattice > GaugeField; typedef iLorentzColourMatrix vobj; typedef typename vobj::scalar_object sobj; + FieldMetaData header; + /////////////////////////////////////////// // Following should become arguments - NerscField header; + /////////////////////////////////////////// header.sequence_number = 1; header.ensemble_id = "UKQCD"; header.ensemble_label = "DWF"; @@ -402,32 +231,31 @@ namespace Grid { GridBase *grid = Umu._grid; - NerscGrid(grid,header); - NerscStatistics(Umu,header); - NerscMachineCharacteristics(header); + GridMetaData(grid,header); + assert(header.nd==4); + GaugeStatistics(Umu,header); + MachineCharacteristics(header); int offset; truncate(file); - if ( two_row ) { - header.floating_point = std::string("IEEE64BIG"); - header.data_type = std::string("4D_SU3_GAUGE"); - Nersc3x2unmunger munge; - offset = writeHeader(header,file); - header.checksum=BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point); - 
writeHeader(header,file); - } else { - header.floating_point = std::string("IEEE64BIG"); - header.data_type = std::string("4D_SU3_GAUGE_3x3"); - NerscSimpleUnmunger munge; - offset = writeHeader(header,file); - header.checksum=BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point); - writeHeader(header,file); - } + // Sod it -- always write 3x3 double + header.floating_point = std::string("IEEE64BIG"); + header.data_type = std::string("4D_SU3_GAUGE_3x3"); + GaugeSimpleUnmunger munge; + offset = writeHeader(header,file); + + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point, + nersc_csum,scidac_csuma,scidac_csumb); + header.checksum = nersc_csum; + writeHeader(header,file); + std::cout< - uint32_t csum=BinaryIO::readRNG(serial,parallel,file,offset); + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); - if ( csum != header.checksum ) { - std::cerr << "checksum mismatch "< { fout.close(); } - void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, - GridParallelRNG &pRNG) { + void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { + if ((traj % Params.saveInterval) == 0) { std::string config, rng; this->build_filenames(traj, Params, config, rng); + uint32_t nersc_csum; + uint32_t scidac_csuma; + uint32_t scidac_csumb; + BinarySimpleUnmunger munge; truncate(rng); - BinaryIO::writeRNG(sRNG, pRNG, rng, 0); + BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); truncate(config); - uint32_t csum = BinaryIO::writeLatticeObject( - U, config, munge, 0, Params.format); + + BinaryIO::writeLatticeObject(U, config, munge, 0, Params.format, + nersc_csum,scidac_csuma,scidac_csumb); std::cout << GridLogMessage << "Written Binary Configuration " << config - << " checksum " << std::hex << csum << std::dec << std::endl; + << " checksum " << std::hex + << nersc_csum <<"/" + << scidac_csuma <<"/" + << scidac_csumb + << std::dec << std::endl; } + }; - void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, - GridParallelRNG &pRNG) { + void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { std::string config, rng; this->build_filenames(traj, Params, config, rng); BinarySimpleMunger munge; - BinaryIO::readRNG(sRNG, pRNG, rng, 0); - uint32_t csum = BinaryIO::readLatticeObject( - U, config, munge, 0, Params.format); + uint32_t nersc_csum; + uint32_t scidac_csuma; + uint32_t scidac_csumb; + BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); + BinaryIO::readLatticeObject(U, config, munge, 0, Params.format, + nersc_csum,scidac_csuma,scidac_csumb); + std::cout << GridLogMessage << "Read Binary Configuration " << config - << " checksum " << std::hex << csum << std::dec << std::endl; + << " checksums " << std::hex << nersc_csum<<"/"< { // check here that the format is valid int ieee32big = (Params.format == std::string("IEEE32BIG")); - int ieee32 = (Params.format == std::string("IEEE32")); + int ieee32 = (Params.format == std::string("IEEE32")); int ieee64big = (Params.format == std::string("IEEE64BIG")); - int ieee64 = (Params.format == std::string("IEEE64")); + int ieee64 = (Params.format == std::string("IEEE64")); if (!(ieee64big || ieee32 || ieee32big || ieee64)) { std::cout << GridLogError << "Unrecognized file format " << Params.format @@ -74,13 +74,17 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer { if 
((traj % Params.saveInterval) == 0) { std::string config, rng; this->build_filenames(traj, Params, config, rng); - - ILDGIO IO(config, ILDGwrite); - BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0); - uint32_t csum = IO.writeConfiguration(U, Params.format); + + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); + IldgIO::writeConfiguration(config,U, Params.format); std::cout << GridLogMessage << "Written ILDG Configuration on " << config - << " checksum " << std::hex << csum << std::dec << std::endl; + << " checksum " << std::hex + << nersc_csum<<"/" + << scidac_csuma<<"/" + << scidac_csumb + << std::dec << std::endl; } }; @@ -89,12 +93,18 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer { std::string config, rng; this->build_filenames(traj, Params, config, rng); - ILDGIO IO(config, ILDGread); - BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0); - uint32_t csum = IO.readConfiguration(U); // format from the header + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); + + FieldMetaData header; + IldgIO::readConfiguration(config,U,header); // format from the header std::cout << GridLogMessage << "Read ILDG Configuration from " << config - << " checksum " << std::hex << csum << std::dec << std::endl; + << " checksum " << std::hex + << nersc_csum<<"/" + << scidac_csuma<<"/" + << scidac_csumb + << std::dec << std::endl; }; }; } diff --git a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h b/lib/qcd/hmc/checkpointers/NerscCheckpointer.h index 395369a0..a4b1b480 100644 --- a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h +++ b/lib/qcd/hmc/checkpointers/NerscCheckpointer.h @@ -70,7 +70,7 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer { std::string config, rng; this->build_filenames(traj, Params, config, rng); - NerscField header; + FieldMetaData header; NerscIO::readRNGState(sRNG, pRNG, header, rng); NerscIO::readConfiguration(U, header, config); }; diff --git a/lib/qcd/utils/Utils.h b/lib/qcd/utils/Utils.h index 61c81cb5..1786db54 100644 --- a/lib/qcd/utils/Utils.h +++ b/lib/qcd/utils/Utils.h @@ -12,7 +12,4 @@ #include #include - - - #endif diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc index b04263c9..a132a2f0 100644 --- a/lib/serialisation/XmlIO.cc +++ b/lib/serialisation/XmlIO.cc @@ -32,16 +32,21 @@ using namespace Grid; using namespace std; // Writer implementation /////////////////////////////////////////////////////// -XmlWriter::XmlWriter(const string &fileName) -: fileName_(fileName) +XmlWriter::XmlWriter(const string &fileName, string toplev) : fileName_(fileName) { - node_ = doc_.append_child(); - node_.set_name("grid"); + if ( toplev == std::string("") ) { + node_=doc_; + } else { + node_=doc_.append_child(); + node_.set_name(toplev.c_str()); + } } XmlWriter::~XmlWriter(void) { - doc_.save_file(fileName_.c_str(), " "); + if ( fileName_ != std::string("") ) { + doc_.save_file(fileName_.c_str(), " "); + } } void XmlWriter::push(const string &s) @@ -53,21 +58,44 @@ void XmlWriter::pop(void) { node_ = node_.parent(); } - -// Reader implementation /////////////////////////////////////////////////////// -XmlReader::XmlReader(const string &fileName) -: fileName_(fileName) +std::string XmlWriter::XmlString(void) { - pugi::xml_parse_result result = doc_.load_file(fileName_.c_str()); - - if ( !result ) - { + std::ostringstream oss; + doc_.save(oss); + return oss.str(); +} + +XmlReader::XmlReader(const char 
*xmlstring,string toplev) : fileName_("") +{ + pugi::xml_parse_result result; + result = doc_.load_string(xmlstring); + if ( !result ) { cerr << "XML error description: " << result.description() << "\n"; cerr << "XML error offset : " << result.offset << "\n"; abort(); } - - node_ = doc_.child("grid"); + if ( toplev == std::string("") ) { + node_ = doc_; + } else { + node_ = doc_.child(toplev.c_str()); + } +} + +// Reader implementation /////////////////////////////////////////////////////// +XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName) +{ + pugi::xml_parse_result result; + result = doc_.load_file(fileName_.c_str()); + if ( !result ) { + cerr << "XML error description: " << result.description() << "\n"; + cerr << "XML error offset : " << result.offset << "\n"; + abort(); + } + if ( toplev == std::string("") ) { + node_ = doc_; + } else { + node_ = doc_.child(toplev.c_str()); + } } bool XmlReader::push(const string &s) diff --git a/lib/serialisation/XmlIO.h b/lib/serialisation/XmlIO.h index f333b9aa..fcdbf1e4 100644 --- a/lib/serialisation/XmlIO.h +++ b/lib/serialisation/XmlIO.h @@ -44,10 +44,9 @@ namespace Grid { class XmlWriter: public Writer - { - + { public: - XmlWriter(const std::string &fileName); + XmlWriter(const std::string &fileName,std::string toplev = std::string("grid") ); virtual ~XmlWriter(void); void push(const std::string &s); void pop(void); @@ -55,6 +54,7 @@ namespace Grid void writeDefault(const std::string &s, const U &x); template void writeDefault(const std::string &s, const std::vector &x); + std::string XmlString(void); private: pugi::xml_document doc_; pugi::xml_node node_; @@ -64,7 +64,8 @@ namespace Grid class XmlReader: public Reader { public: - XmlReader(const std::string &fileName); + XmlReader(const char *xmlstring,std::string toplev = std::string("grid") ); + XmlReader(const std::string &fileName,std::string toplev = std::string("grid") ); virtual ~XmlReader(void) = default; bool push(const std::string &s); void pop(void); @@ -118,7 +119,7 @@ namespace Grid std::string buf; readDefault(s, buf); - std::cout << s << " " << buf << std::endl; + // std::cout << s << " " << buf << std::endl; fromString(output, buf); } diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index 14c6080d..ca04e623 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -64,8 +64,8 @@ int main (int argc, char ** argv) std::cout < U(4,&Fine); - NerscField header; + FieldMetaData header; std::string file("./ckpoint_lat"); NerscIO::readConfiguration(Umu,header,file); diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc index 7d911dfd..ceddee77 100644 --- a/tests/IO/Test_serialisation.cc +++ b/tests/IO/Test_serialisation.cc @@ -31,6 +31,7 @@ Author: Peter Boyle using namespace Grid; +using namespace Grid::QCD; GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3); @@ -62,6 +63,7 @@ public: } }; + int16_t i16 = 1; uint16_t u16 = 2; int32_t i32 = 3; @@ -237,7 +239,22 @@ int main(int argc,char **argv) std::cout << "Loaded (JSON) -----------------" << std::endl; std::cout << jcopy1 << std::endl << jveccopy1 << std::endl; } - + + { + ildgFormat format; + format.version =1.0; + format.field =std::string("su3gauge"); + format.precision =32; + format.lx =24; + format.ly =24; + format.lz =24; + format.lt =48; + XmlWriter WR("ildg-format.xml",""); + XmlWriter WRs("",""); + write(WR,"ildgFormat",format); + write(WRs,"ildgFormat",format); + std::cout << " XmlString: " < Date: Sun, 11 Jun 2017 
23:19:20 +0100 Subject: [PATCH 053/170] New files --- lib/parallelIO/MetaData.h | 223 ++++++++++++++++++++++++++++++++++++++ tests/IO/Test_ildg_io.cc | 93 ++++++++++++++++ 2 files changed, 316 insertions(+) create mode 100644 lib/parallelIO/MetaData.h create mode 100644 tests/IO/Test_ildg_io.cc diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h new file mode 100644 index 00000000..e91371b8 --- /dev/null +++ b/lib/parallelIO/MetaData.h @@ -0,0 +1,223 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/parallelIO/NerscIO.h + + Copyright (C) 2015 + + + Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Grid { + namespace QCD { + + using namespace Grid; + + //////////////////////////////////////////////////////////////////////////////// + // header specification/interpretation + //////////////////////////////////////////////////////////////////////////////// + class FieldMetaData : Serializable { + public: + + GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData, + int, nd, + std::vector, dimension, + std::vector, boundary, + int, data_start, + std::string, hdr_version, + std::string, storage_format, + double, link_trace, + double, plaquette, + uint32_t, checksum, + uint32_t, scidac_checksuma, + uint32_t, scidac_checksumb, + unsigned int, sequence_number, + std::string, data_type, + std::string, ensemble_id, + std::string, ensemble_label, + std::string, ildg_lfn, + std::string, creator, + std::string, creator_hardware, + std::string, creation_date, + std::string, archive_date, + std::string, floating_point); + }; + + ////////////////////////////////////////////////////////////////////// + // Bit and Physical Checksumming and QA of data + ////////////////////////////////////////////////////////////////////// + inline void GridMetaData(GridBase *grid,FieldMetaData &header) + { + int nd = grid->_ndimension; + header.nd = nd; + header.dimension.resize(nd); + header.boundary.resize(nd); + for(int d=0;d_fdimensions[d]; + } + for(int d=0;d + inline void GaugeStatistics(GaugeField & data,FieldMetaData &header) + { + // How to convert data precision etc... 
+ header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); + header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); + } + + inline void MachineCharacteristics(FieldMetaData &header) + { + // Who + struct passwd *pw = getpwuid (getuid()); + if (pw) header.creator = std::string(pw->pw_name); + + // When + std::time_t t = std::time(nullptr); + std::tm tm = *std::localtime(&t); + std::ostringstream oss; + oss << std::put_time(&tm, "%c %Z"); + header.creation_date = oss.str(); + header.archive_date = header.creation_date; + + // What + struct utsname name; uname(&name); + header.creator_hardware = std::string(name.nodename)+"-"; + header.creator_hardware+= std::string(name.machine)+"-"; + header.creator_hardware+= std::string(name.sysname)+"-"; + header.creator_hardware+= std::string(name.release); + } + +#define dump_meta_data(field, s) \ + s << "BEGIN_HEADER" << std::endl; \ + s << "HDR_VERSION = " << field.hdr_version << std::endl; \ + s << "DATATYPE = " << field.data_type << std::endl; \ + s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \ + for(int i=0;i<4;i++){ \ + s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \ + } \ + s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \ + s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \ + for(int i=0;i<4;i++){ \ + s << "BOUNDARY_"< using iLorentzColour2x3 = iVector, 2>, Nd >; + + typedef iLorentzColour2x3 LorentzColour2x3; + typedef iLorentzColour2x3 LorentzColour2x3F; + typedef iLorentzColour2x3 LorentzColour2x3D; + + template + struct GaugeSimpleMunger{ + void operator()(fobj &in, sobj &out) { + for (int mu = 0; mu < Nd; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; + }; + + template + struct GaugeSimpleUnmunger { + + void operator()(sobj &in, fobj &out) { + for (int mu = 0; mu < Nd; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; + }; + + template + struct Gauge3x2munger{ + void operator() (fobj &in,sobj &out){ + for(int mu=0;mu + struct Gauge3x2unmunger{ + void operator() (sobj &in,fobj &out){ + for(int mu=0;mu +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::cout < simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + //std::vector latt_size ({48,48,48,96}); + //std::vector latt_size ({32,32,32,32}); + std::vector latt_size ({16,16,16,32}); + std::vector clatt_size ({4,4,4,8}); + int orthodir=3; + int orthosz =latt_size[orthodir]; + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridCartesian Coarse(clatt_size,simd_layout,mpi_layout); + + + GridParallelRNG pRNGa(&Fine); + GridParallelRNG pRNGb(&Fine); + GridSerialRNG sRNGa; + GridSerialRNG sRNGb; + + std::cout <({45,12,81,9})); + sRNGa.SeedFixedIntegers(std::vector({45,12,81,9})); + std::cout < U(4,&Fine); + + SU3::HotConfiguration(pRNGa,Umu); + + + FieldMetaData header; + + std::cout < Date: Mon, 12 Jun 2017 00:41:21 +0100 Subject: [PATCH 055/170] Odd new error on G++ 49 on travis --- lib/serialisation/MacroMagic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index a864989c..04f1b401 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -110,7 +110,7 @@ THE SOFTWARE. #define GRID_MACRO_MEMBER(A,B) A B; #define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B)); -#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " < Date: Mon, 12 Jun 2017 18:27:32 +0100 Subject: [PATCH 056/170] QedVFol: Allow output of scalar propagator and vacuum polarisation projected to arbitrary lattice momentum, not just zero-momentum. --- extras/Hadrons/Modules/MScalar/ChargedProp.cc | 121 ++++--- .../Hadrons/Modules/MScalar/ChargedProp.hpp | 3 +- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 318 ++++++++++++------ extras/Hadrons/Modules/MScalar/ScalarVP.hpp | 3 +- 4 files changed, 302 insertions(+), 143 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc index a9089056..285b237a 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc @@ -171,58 +171,91 @@ void TChargedProp::execute(void) // OUTPUT IF NECESSARY if (!par().output.empty()) { - std::string filename = par().output + "." + + for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p) + { + std::vector mom = strToVec(par().outputMom[i_p]); + std::string filename = par().output + "_" + std::to_string(mom[0]) + + std::to_string(mom[1]) + + std::to_string(mom[2]) + + "." + std::to_string(env().getTrajectory()); - - LOG(Message) << "Saving zero-momentum projection to '" + + LOG(Message) << "Saving (" << par().outputMom[i_p] << ") momentum projection to '" << filename << "'..." 
<< std::endl; - - CorrWriter writer(filename); - std::vector vecBuf; - std::vector result; - - write(writer, "charge", q); - // Write full propagator - sliceSum(prop, vecBuf, Tp); - result.resize(vecBuf.size()); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); - } - write(writer, "prop", result); + CorrWriter writer(filename); + std::vector vecBuf; + std::vector result; - // Write free propagator - sliceSum(*prop0_, vecBuf, Tp); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); - } - write(writer, "prop_0", result); + write(writer, "charge", q); + write(writer, "mass", par().mass); - // Write propagator O(q) term - sliceSum(propQ, vecBuf, Tp); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); - } - write(writer, "prop_Q", result); + // Write full propagator + buf = prop; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + buf = buf*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(buf, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop", result); - // Write propagator sunset term - sliceSum(propSun, vecBuf, Tp); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); - } - write(writer, "prop_Sun", result); + // Write free propagator + buf = *prop0_; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + buf = buf*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(buf, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_0", result); - // Write propagator tadpole term - sliceSum(propTad, vecBuf, Tp); - for (unsigned int t = 0; t < vecBuf.size(); ++t) - { - result[t] = TensorRemove(vecBuf[t]); + // Write propagator O(q) term + buf = propQ; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + buf = buf*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(buf, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_Q", result); + + // Write propagator sunset term + buf = propSun; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + buf = buf*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(buf, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_Sun", result); + + // Write propagator tadpole term + buf = propTad; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + buf = buf*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(buf, vecBuf, Tp); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(writer, "prop_Tad", result); } - write(writer, "prop_Tad", result); } } diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp index 369fff30..92b89f9f 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp @@ -20,7 +20,8 @@ public: std::string, source, double, mass, double, charge, - std::string, output); + std::string, output, + std::vector, outputMom); }; class TChargedProp: public Module diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index 19cdbb9a..e4f4e820 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -90,7 
+90,7 @@ void TScalarVP::setup(void) // execution /////////////////////////////////////////////////////////////////// void TScalarVP::execute(void) { - // Get objects cached by ChargedProp module + // Get objects cached by ChargedProp module Complex ci(0.0,1.0); FFT fft(env().getGrid()); Real q = static_cast(env().getModule(par().scalarProp))->par().charge; @@ -144,32 +144,49 @@ void TScalarVP::execute(void) } // Open output files if necessary - CorrWriter *writer, *writer0, *writerD; - std::vector vecBuf; - std::vector result; + std::vector writer, writer0, writerD; if (!par().output.empty()) { - std::string filename = par().output + "." + - std::to_string(env().getTrajectory()); - std::string filename0 = par().output + "_free." + - std::to_string(env().getTrajectory()); - std::string filenameD = par().output + "_diagrams." + - std::to_string(env().getTrajectory()); - - // LOG(Message) << "Saving zero-momentum projection to '" - // << filename << "'..." << std::endl; - writer = new CorrWriter(filename); - writer0 = new CorrWriter(filename0); - writerD = new CorrWriter(filenameD); - - write(*writer, "charge", q); - write(*writer, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); - write(*writer0, "charge", 0.0); - write(*writer0, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); - write(*writerD, "charge", q); - write(*writerD, "mass", static_cast(env().getModule(par().scalarProp))->par().mass); - } + for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p) + { + std::vector mom = strToVec(par().outputMom[i_p]); + std::string filename = par().output + "_" + std::to_string(mom[0]) + + std::to_string(mom[1]) + + std::to_string(mom[2]) + + "." + + std::to_string(env().getTrajectory()); + std::string filename0 = par().output + "_" + std::to_string(mom[0]) + + std::to_string(mom[1]) + + std::to_string(mom[2]) + + "_free." + + std::to_string(env().getTrajectory()); + std::string filenameD = par().output + "_" + std::to_string(mom[0]) + + std::to_string(mom[1]) + + std::to_string(mom[2]) + + "_diagrams." 
+ + std::to_string(env().getTrajectory()); + + CorrWriter *writer_i = new CorrWriter(filename); + writer.push_back(writer_i); + CorrWriter *writer0_i = new CorrWriter(filename0); + writer0.push_back(writer0_i); + CorrWriter *writerD_i = new CorrWriter(filenameD); + writerD.push_back(writerD_i); + + write(*writer[i_p], "charge", q); + write(*writer[i_p], "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + write(*writer0[i_p], "charge", 0.0); + write(*writer0[i_p], "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + write(*writerD[i_p], "charge", q); + write(*writerD[i_p], "mass", static_cast(env().getModule(par().scalarProp))->par().mass); + } + } + std::vector vecBuf; + std::vector result; + ScalarField vpPhase(env().getGrid()); + + // Do contractions for (unsigned int nu = 0; nu < env().getNd(); ++nu) { peekSite(Anu0, peekLorentz(A, nu), coor0); @@ -190,15 +207,25 @@ void TScalarVP::execute(void) // Output if necessary if (!par().output.empty()) { - sliceSum(freeVpTensor[mu][nu], vecBuf, Tp); - result.resize(vecBuf.size()); - for (unsigned int t = 0; t < vecBuf.size(); ++t) + std::vector mom; + for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p) { - result[t] = TensorRemove(vecBuf[t]); + mom = strToVec(par().outputMom[i_p]); + vpPhase = freeVpTensor[mu][nu]; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(vpPhase, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writer0[i_p], + "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), + result); } - write(*writer0, - "Pi_"+std::to_string(mu)+"_"+std::to_string(nu), - result); } // "Exchange" terms @@ -209,20 +236,30 @@ void TScalarVP::execute(void) tmp_vp -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu) * prop1 * (1.0 + ci*q*Anu0); tmp_vp = 2.0*real(tmp_vp); - vpTensor[mu][nu] = tmp_vp*1.0; + vpTensor[mu][nu] = tmp_vp; // Output if necessary if (!par().output.empty()) { - sliceSum(tmp_vp, vecBuf, Tp); - result.resize(vecBuf.size()); - for (unsigned int t = 0; t < vecBuf.size(); ++t) + std::vector mom; + for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p) { - result[t] = TensorRemove(vecBuf[t]); + mom = strToVec(par().outputMom[i_p]); + vpPhase = tmp_vp; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(vpPhase, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + write(*writerD[i_p], + "Pi_exchange_"+std::to_string(mu)+"_"+std::to_string(nu), + result); } - write(*writerD, - "Pi_exchange_"+std::to_string(mu)+"_"+std::to_string(nu), - result); } // Subtract O(alpha^2) term @@ -238,15 +275,25 @@ void TScalarVP::execute(void) // Output if necessary if (!par().output.empty()) { - sliceSum(tmp_vp, vecBuf, Tp); - result.resize(vecBuf.size()); - for (unsigned int t = 0; t < vecBuf.size(); ++t) + std::vector mom; + for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p) { - result[t] = TensorRemove(vecBuf[t]); + mom = strToVec(par().outputMom[i_p]); + vpPhase = tmp_vp; + for (unsigned int j = 0; j < env().getNd()-1; ++j) + { + vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + } + sliceSum(vpPhase, vecBuf, Tp); + result.resize(vecBuf.size()); + for (unsigned int t = 0; t < vecBuf.size(); ++t) + { + result[t] = TensorRemove(vecBuf[t]); + } + 
+                write(*writerD[i_p],
+                      "Pi_alpha2_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_alpha2_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Sunset from unshifted source
@@ -260,15 +307,25 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_sunset_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_sunset_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Sunset from shifted source
@@ -281,15 +338,25 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_sunset_shifted_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_sunset_shifted_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Tadpole from unshifted source
@@ -303,15 +370,25 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_tadpole_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_tadpole_unshifted_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Tadpole from shifted source
@@ -324,15 +401,25 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_tadpole_shifted_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_tadpole_shifted_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Source tadpole
@@ -349,15 +436,25 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_sourcetadpole_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_sourcetadpole_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Sink tadpole
@@ -373,34 +470,61 @@ void TScalarVP::execute(void)
         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(tmp_vp, vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = tmp_vp;
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writerD[i_p],
+                      "Pi_sinktadpole_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writerD,
-                  "Pi_sinktadpole_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }

         // Output if necessary
         if (!par().output.empty())
         {
-            sliceSum(vpTensor[mu][nu], vecBuf, Tp);
-            result.resize(vecBuf.size());
-            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            std::vector<int> mom;
+            for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
             {
-                result[t] = TensorRemove(vecBuf[t]);
+                mom = strToVec<int>(par().outputMom[i_p]);
+                vpPhase = vpTensor[mu][nu];
+                for (unsigned int j = 0; j < env().getNd()-1; ++j)
+                {
+                    vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]);
+                }
+                sliceSum(vpPhase, vecBuf, Tp);
+                result.resize(vecBuf.size());
+                for (unsigned int t = 0; t < vecBuf.size(); ++t)
+                {
+                    result[t] = TensorRemove(vecBuf[t]);
+                }
+                write(*writer[i_p],
+                      "Pi_"+std::to_string(mu)+"_"+std::to_string(nu),
+                      result);
             }
-            write(*writer, "Pi_"+std::to_string(mu)+"_"+std::to_string(nu),
-                  result);
         }
       }
     }

-    delete writer;
-    delete writer0;
-    delete writerD;
+    if (!par().output.empty())
+    {
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            delete writer[i_p];
+            delete writer0[i_p];
+            delete writerD[i_p];
+        }
+    }
 }

 void TScalarVP::momD1(ScalarField &s, FFT &fft)

diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp
index 81071ca0..e4a6e825 100644
--- a/extras/Hadrons/Modules/MScalar/ScalarVP.hpp
+++ b/extras/Hadrons/Modules/MScalar/ScalarVP.hpp
@@ -18,7 +18,8 @@ public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
                                     std::string, emField,
                                     std::string, scalarProp,
-                                    std::string, output);
+                                    std::string, output,
+                                    std::vector<std::string>, outputMom);
 };

 class TScalarVP: public Module

From a16b1e134ea9f7d7353648aa09a64ffefc275ab9 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Tue, 13 Jun 2017 10:48:43 +0100
Subject: [PATCH 057/170] gcc 4.9 fix

---
 lib/parallelIO/MetaData.h      | 8 +++++---
 tests/IO/Test_serialisation.cc | 5 ++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h
index 85a6e0b9..1bad07f2 100644
--- a/lib/parallelIO/MetaData.h
+++ b/lib/parallelIO/MetaData.h
@@ -37,9 +37,6 @@
 #include 

 namespace Grid {
-  namespace QCD {
-
-    using namespace Grid;

   ////////////////////////////////////////////////////////////////////////////////
   // header specification/interpretation
@@ -71,6 +68,11 @@ namespace Grid {
                                   std::string, floating_point);
   };

+  namespace QCD {
+
+    using namespace Grid;
+
+
   //////////////////////////////////////////////////////////////////////
   // Bit and Physical Checksumming and QA of data
   //////////////////////////////////////////////////////////////////////

diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index ceddee77..6d918787 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -45,8 +45,8 @@ public:
                           double, y,
                           bool , b,
                           std::vector<double>, array,
-                          std::vector<std::vector<double>>, twodimarray,
-                          std::vector<std::vector<std::vector<Complex>>>, cmplx3darray
+                          std::vector<std::vector<double> >, twodimarray,
+                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray
                           );
   myclass() {}
   myclass(int i)
@@ -63,7 +63,6 @@ public:
   }
 };

-
 int16_t  i16 = 1;
 uint16_t u16 = 2;
 int32_t  i32 = 3;

From 0494feec98f1c53b2ac20cab2a4e159637ade84f Mon Sep 17 00:00:00 2001
From: paboyle
Date: Tue, 13 Jun 2017 12:00:23 +0100
Subject: [PATCH 058/170] Libz dependency

---
 configure.ac            | 4 ++++
 lib/parallelIO/IldgIO.h | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/configure.ac b/configure.ac
index 2fc9dfec..f7284d48 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,10 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
                 In order to use ILDG file format please install or provide the correct path to your installation
                 Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
+AC_SEARCH_LIBS([crc32], [z],
+               [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
+               [have_zlib=true],
+               [AC_MSG_ERROR(zlib library was not found in your system.)])

 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index df840fb2..a6810b0d 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -379,6 +379,9 @@ class IldgIO : public BinaryIO {
       assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
       assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
       std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
+    } else {
+      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
+      //Could choose to fail ?
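      // NB: a minimal sketch of how a hard-failure policy could look here,
      // using a hypothetical strictChecksums flag (not part of this patch):
      //
      //   if (strictChecksums) {
      //     std::cout << GridLogError << "Unverifiable data; aborting" << std::endl;
      //     exit(EXIT_FAILURE);
      //   }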
} if ( found_FieldMetaData || found_usqcdInfo ) { From 91199a8ea0907ff1b074066ae566a318b803e437 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 13 Jun 2017 12:21:29 +0100 Subject: [PATCH 059/170] openmpi is not const safe --- lib/parallelIO/BinaryIO.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index bc3da38b..7226ccba 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -376,7 +376,7 @@ class BinaryIO { if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; - ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); MPI_File_close(&fh); @@ -426,7 +426,7 @@ class BinaryIO { if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; - ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); + ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); MPI_File_close(&fh); From e7564f8330eceac22e73b61cca4110bdb2ea5b09 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 13 Jun 2017 12:22:50 +0100 Subject: [PATCH 060/170] Starting a test for reading an ILDG file. --- tests/IO/Test_ildg_read.cc | 112 +++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 tests/IO/Test_ildg_read.cc diff --git a/tests/IO/Test_ildg_read.cc b/tests/IO/Test_ildg_read.cc new file mode 100644 index 00000000..70a46dbf --- /dev/null +++ b/tests/IO/Test_ildg_read.cc @@ -0,0 +1,112 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_nersc_io.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + std::vector simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + std::vector latt_size = GridDefaultLatt(); + int orthodir=3; + int orthosz =latt_size[orthodir]; + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + + LatticeGaugeField Umu(&Fine); + std::vector U(4,&Fine); + + FieldMetaData header; + std::string file("./ckpoint_lat"); + IldgIO::readConfiguration(file,Umu,header); + + for(int mu=0;mu(Umu,mu); + } + + // Painful ; fix syntactical niceness + LatticeComplex LinkTrace(&Fine); + LinkTrace=zero; + for(int mu=0;mu Plaq_T(orthosz); + sliceSum(Plaq,Plaq_T,Nd-1); + int Nt = Plaq_T.size(); + + TComplex Plaq_T_sum; + Plaq_T_sum=zero; + for(int t=0;t Date: Tue, 13 Jun 2017 13:26:59 +0200 Subject: [PATCH 061/170] ARM neon intrinsics support --- configure.ac | 3 + lib/simd/Grid_generic_types.h | 2 +- lib/simd/Grid_neon.h | 460 +++++++++++++++++++++++++++------- lib/simd/Grid_vector_types.h | 44 ++-- 4 files changed, 396 insertions(+), 113 deletions(-) diff --git a/configure.ac b/configure.ac index 62b7545b..20f71128 100644 --- a/configure.ac +++ b/configure.ac @@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in [generic SIMD vector width (in bytes)]) SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_FLAGS='';; + NEONv8) + AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) + SIMD_FLAGS='';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 642f6ffe..eac65e09 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - +#define GEN_SIMD_WIDTH 16 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 7c1ad443..f3f802e7 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,11 +1,12 @@ - /************************************************************************************* +/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_neon.h Copyright (C) 2015 +Author: Nils Meyer Author: Peter Boyle Author: neo @@ -27,18 +28,23 @@ Author: neo *************************************************************************************/ /* END LEGAL */ //---------------------------------------------------------------------- -/*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv8 +/* + + ARMv8 NEON intrinsics layer by + + Nils Meyer , + University of Regensburg, Germany + SFB/TRR55 - Experimental - Using intrinsics - DEVELOPING! 
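  NB: the layer is selected through the NEONv8 case added to configure.ac
  earlier in this patch; presumably an invocation along the lines of

      ../configure --enable-simd=NEONv8

  picks it up (the flag value is inferred from the case label above, so
  treat this as a sketch rather than documented usage).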
*/ -// Time-stamp: <2015-07-10 17:45:09 neo> //---------------------------------------------------------------------- +//#ifndef ARM_NEON +//#define ARM_NEON +#include "Grid_generic_types.h" #include -// ARMv8 supports double precision - +namespace Grid { namespace Optimization { template @@ -46,16 +52,20 @@ namespace Optimization { float32x4_t f; vtype v; }; - union u128f { float32x4_t v; float f[4]; }; union u128d { float64x2_t v; - double f[4]; + double f[2]; }; - + // half precision + union u128h { + float16x8_t v; + uint16_t f[8]; + }; + struct Vsplat{ //Complex float inline float32x4_t operator()(float a, float b){ @@ -64,31 +74,31 @@ namespace Optimization { } // Real float inline float32x4_t operator()(float a){ - return vld1q_dup_f32(&a); + return vdupq_n_f32(a); } //Complex double - inline float32x4_t operator()(double a, double b){ - float tmp[4]={(float)a,(float)b,(float)a,(float)b}; - return vld1q_f32(tmp); + inline float64x2_t operator()(double a, double b){ + double tmp[2]={a,b}; + return vld1q_f64(tmp); } - //Real double - inline float32x4_t operator()(double a){ - return vld1q_dup_f32(&a); + //Real double // N:tbc + inline float64x2_t operator()(double a){ + return vdupq_n_f64(a); } - //Integer + //Integer // N:tbc inline uint32x4_t operator()(Integer a){ - return vld1q_dup_u32(&a); + return vdupq_n_u32(a); } }; struct Vstore{ - //Float + //Float inline void operator()(float32x4_t a, float* F){ vst1q_f32(F, a); } //Double - inline void operator()(float32x4_t a, double* D){ - vst1q_f32((float*)D, a); + inline void operator()(float64x2_t a, double* D){ + vst1q_f64(D, a); } //Integer inline void operator()(uint32x4_t a, Integer* I){ @@ -97,54 +107,54 @@ namespace Optimization { }; - struct Vstream{ - //Float + struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? 
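  // NB: AArch64 ACLE exposes no direct counterpart of _mm_stream_ps, hence
  // the plain copies below; memcpy out of the vector register is the
  // aliasing-safe idiom, e.g. (illustrative only):
  //   float32x4_t v = vdupq_n_f32(1.f);
  //   float buf[4];
  //   memcpy(buf, &v, sizeof(v));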
+ //Float // N:generic inline void operator()(float * a, float32x4_t b){ - + memcpy(a,&b,4*sizeof(float)); } - //Double - inline void operator()(double * a, float32x4_t b){ - + //Double // N:generic + inline void operator()(double * a, float64x2_t b){ + memcpy(a,&b,2*sizeof(double)); } }; + // Nils: Vset untested; not used currently in Grid at all; + // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b struct Vset{ - // Complex float + // Complex float // N:ok inline float32x4_t operator()(Grid::ComplexF *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; + return vld1q_f32(tmp); } - // Complex double - inline float32x4_t operator()(Grid::ComplexD *a){ - float32x4_t foo; - return foo; + // Complex double // N:ok + inline float64x2_t operator()(Grid::ComplexD *a){ + double tmp[2]={a[0].imag(),a[0].real()}; + return vld1q_f64(tmp); } - // Real float + // Real float // N:ok inline float32x4_t operator()(float *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[3],a[2],a[1],a[0]}; + return vld1q_f32(tmp); } - // Real double - inline float32x4_t operator()(double *a){ - float32x4_t foo; - return foo; + // Real double // N:ok + inline float64x2_t operator()(double *a){ + double tmp[2]={a[1],a[0]}; + return vld1q_f64(tmp); } - // Integer + // Integer // N:ok inline uint32x4_t operator()(Integer *a){ - uint32x4_t foo; - return foo; + return vld1q_dup_u32(a); } - - }; + // N:leaving as is template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled - inline Out_type operator()(In_type in){ + inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; @@ -184,26 +194,98 @@ namespace Optimization { } }; + struct MultRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + float32x4_t re = vtrn1q_f32(a, a); + return vmulq_f32(re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + float64x2_t re = vzip1q_f64(a, a); + return vmulq_f64(re, b); + } + }; + + struct MaddRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ + float32x4_t re = vtrn1q_f32(a, a); + return vfmaq_f32(c, re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ + float64x2_t re = vzip1q_f64(a, a); + return vfmaq_f64(c, re, b); + } + }; + + struct Div{ + // Real float + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + return vdivq_f32(a, b); + } + // Real double + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vdivq_f64(a, b); + } + }; + struct MultComplex{ // Complex float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t foo; - return foo; + + float32x4_t r0, r1, r2, r3, r4; + + // a = ar ai Ar Ai + // b = br bi Br Bi + // collect real/imag part, negate bi and Bi + r0 = vtrn1q_f32(b, b); // br br Br Br + r1 = vnegq_f32(b); // -br -bi -Br -Bi + r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi + + // the fun part + r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ... + r4 = vrev64q_f32(r3); // -bi*ai bi*ar ... + + // fma(a,b,c) = a+b*c + return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... 
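      // NB: checking the shuffles per complex lane, with a = (ar,ai), b = (br,bi):
      //   r0 = (br, br), r4 = (-ai*bi, ar*bi), so
      //   vfmaq_f32(r4, r0, a) = (ar*br - ai*bi, ai*br + ar*bi),
      // i.e. exactly (ar + i*ai)*(br + i*bi).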
+ + // no fma, use mul and add + //float32x4_t r5; + //r5 = vmulq_f32(r0, a); + //return vaddq_f32(r4, r5); } // Complex double inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - float32x4_t foo; - return foo; + + float64x2_t r0, r1, r2, r3, r4; + + // b = br bi + // collect real/imag part, negate bi + r0 = vtrn1q_f64(b, b); // br br + r1 = vnegq_f64(b); // -br -bi + r2 = vtrn2q_f64(b, r1); // bi -bi + + // the fun part + r3 = vmulq_f64(r2, a); // bi*ar -bi*ai + r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar + + // fma(a,b,c) = a+b*c + return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi + + // no fma, use mul and add + //float64x2_t r5; + //r5 = vmulq_f64(r0, a); + //return vaddq_f64(r4, r5); } }; struct Mult{ // Real float inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ - return vaddq_f32(vmulq_f32(b,c),a); + //return vaddq_f32(vmulq_f32(b,c),a); + return vfmaq_f32(a, b, c); } inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ - return vaddq_f64(vmulq_f64(b,c),a); + //return vaddq_f64(vmulq_f64(b,c),a); + return vfmaq_f64(a, b, c); } inline float32x4_t operator()(float32x4_t a, float32x4_t b){ return vmulq_f32(a,b); @@ -221,89 +303,275 @@ namespace Optimization { struct Conj{ // Complex single inline float32x4_t operator()(float32x4_t in){ - return in; + // ar ai br bi -> ar -ai br -bi + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(in, r1); // ar -ai br -bi } // Complex double - //inline float32x4_t operator()(float32x4_t in){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in){ + + float64x2_t r0, r1; + r0 = vextq_f64(in, in, 1); // ai ar + r1 = vnegq_f64(r0); // -ai -ar + return vextq_f64(r0, r1, 1); // ar -ai + } // do not define for integer input }; struct TimesMinusI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - return in; + // ar ai br bi -> ai -ar ai -br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(in); // ai ar bi br + return vtrn1q_f32(r1, r0); // ar -ai br -bi } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return in; - //} - - + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> b -ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(in, tmp, 1); + } }; struct TimesI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - //need shuffle - return in; + // ar ai br bi -> -ai ar -bi br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(r1, in); // -ai ar -bi br } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> -b ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(tmp, in, 1); + } + }; + + struct Permute{ + + static inline float32x4_t Permute0(float32x4_t in){ // N:ok + // AB CD -> CD AB + return vextq_f32(in, in, 2); + }; + static inline float32x4_t Permute1(float32x4_t in){ // N:ok + // AB CD -> BA DC + return vrev64q_f32(in); + }; + static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle + return in; + }; + static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle + return in; + }; + + static inline float64x2_t Permute0(float64x2_t in){ // N:ok + // AB -> BA + return vextq_f64(in, in, 1); + }; + 
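    // NB: Permute<n> swaps elements at distance Nsimd/2^(n+1); with only two
    // double (or four float) lanes in a 128-bit register, the finer
    // granularities cannot move anything, hence the pass-through stubs below.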
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle + return in; + }; + + }; + + struct Rotate{ + + static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok + switch(n){ + case 0: // AB CD -> AB CD + return tRotate<0>(in); + break; + case 1: // AB CD -> BC DA + return tRotate<1>(in); + break; + case 2: // AB CD -> CD AB + return tRotate<2>(in); + break; + case 3: // AB CD -> DA BC + return tRotate<3>(in); + break; + default: assert(0); + } + } + static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok + switch(n){ + case 0: // AB -> AB + return tRotate<0>(in); + break; + case 1: // AB -> BA + return tRotate<1>(in); + break; + default: assert(0); + } + } + +// working, but no restriction on n +// template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; +// template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; + +// restriction on n + template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; + template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; + + }; + + struct PrecisionChange { + + static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { + float16x4_t h = vcvt_f16_f32(a); + return vcvt_high_f16_f32(h, b); + } + static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { + sb = vcvt_high_f32_f16(h); + // there is no direct conversion from lower float32x4_t to float64x2_t + // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang + //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang + // workaround for clang + uint32x4_t h1u = reinterpret_cast(h); + float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); + sa = vcvt_high_f32_f16(h1); + } + static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { + float32x2_t s = vcvt_f32_f64(a); + return vcvt_high_f32_f64(s, b); + + } + static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { + b = vcvt_high_f64_f32(s); + // there is no direct conversion from lower float32x4_t to float64x2_t + float32x4_t s1 = vextq_f32(s, s, 2); + a = vcvt_high_f64_f32(s1); + + } + static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { + float32x4_t s1 = DtoS(a, b); + float32x4_t s2 = DtoS(c, d); + return StoH(s1, s2); + } + static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { + float32x4_t s1, s2; + HtoS(h, s1, s2); + StoD(s1, a, b); + StoD(s2, c, d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + + struct Exchange{ + static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: ABEF + // in2: EFGH -> out2: CDGH + + // z: CDAB + float32x4_t z = vextq_f32(in1, in1, 2); + // out1: ABEF + out1 = vextq_f32(z, in2, 2); + + // z: GHEF + z = vextq_f32(in2, in2, 2); + // out2: CDGH + out2 = vextq_f32(in1, z, 2); + }; + + static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: AECG + // in2: EFGH -> out2: BFDH + out1 = vtrn1q_f32(in1, in2); + out2 = vtrn2q_f32(in1, in2); + }; + static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + 
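      // NB: Exchange<n> interleaves blocks of Nsimd/2^(n+1) elements from the
      // two inputs; with four single-precision lanes, blocks smaller than one
      // element cannot occur, so granularities 2 and 3 deliberately trap.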
return; + }; + static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + // double precision + static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + // in1: AB -> out1: AC + // in2: CD -> out2: BD + out1 = vzip1q_f64(in1, in2); + out2 = vzip2q_f64(in1, in2); + }; + static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; }; ////////////////////////////////////////////// // Some Template specialization - template < typename vtype > - void permute(vtype &a, vtype b, int perm) { - }; //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(float32x4_t in){ - return 0; + float32x4_t v1; // two complex + v1 = Optimization::Permute::Permute0(in); + v1 = vaddq_f32(v1,in); + u128f conv; conv.v=v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); } //Real float Reduce template<> inline Grid::RealF Reduce::operator()(float32x4_t in){ - float32x2_t high = vget_high_f32(in); - float32x2_t low = vget_low_f32(in); - float32x2_t tmp = vadd_f32(low, high); - float32x2_t sum = vpadd_f32(tmp, tmp); - return vget_lane_f32(sum,0); + return vaddvq_f32(in); } - - + + //Complex double Reduce - template<> + template<> // N:by Boyle inline Grid::ComplexD Reduce::operator()(float64x2_t in){ - return 0; + u128d conv; conv.v = in; + return Grid::ComplexD(conv.f[0],conv.f[1]); } - + //Real double Reduce template<> inline Grid::RealD Reduce::operator()(float64x2_t in){ - float64x2_t sum = vpaddq_f64(in, in); - return vgetq_lane_f64(sum,0); + return vaddvq_f64(in); } //Integer Reduce template<> inline Integer Reduce::operator()(uint32x4_t in){ // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); + printf("Reduce : Missing integer implementation -> FIX\n"); assert(0); } } ////////////////////////////////////////////////////////////////////////////////////// -// Here assign types -namespace Grid { +// Here assign types +// typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef float16x8_t SIMD_Htype; // Half precision type typedef float32x4_t SIMD_Ftype; // Single precision type typedef float64x2_t SIMD_Dtype; // Double precision type typedef uint32x4_t SIMD_Itype; // Integer type @@ -312,13 +580,6 @@ namespace Grid { inline void prefetch_HINT_T0(const char *ptr){}; - // Gpermute function - template < typename VectorSIMD > - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { - Optimization::permute(y.v,b.v,perm); - } - - // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -326,16 +587,21 @@ namespace Grid { typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; - + // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI 
TimesISIMD; } + +//#endif // ARM_NEON diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 0048382f..424b5573 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -328,15 +328,15 @@ class Grid_simd { /////////////////////////////////////// //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") + //#pragma GCC push_options + //#pragma GCC optimize ("O0") //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -368,7 +368,7 @@ class Grid_simd { //#pragma GCC pop_options //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +379,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +406,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,7 +425,7 @@ class Grid_simd { } - + }; // end of Grid_simd class definition @@ -451,29 +451,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,13 +604,27 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; +// TEST for Test_simd +template = 0> +inline Grid_simd real_mult(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; + template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; - +// TEST for Test_simd +template = 0> +inline Grid_simd real_madd(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; // Distinguish between complex types and others template = 0> @@ -640,7 +654,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); From 12ccc73cf512f09cebda7a8fd0dc13fdce0d9e3e Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 14 Jun 2017 05:19:17 +0100 Subject: [PATCH 062/170] Serialisation no compile fix --- tests/IO/Test_serialisation.cc | 15 --------------- 1 file changed, 15 
deletions(-) diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc index 6d918787..d5b52044 100644 --- a/tests/IO/Test_serialisation.cc +++ b/tests/IO/Test_serialisation.cc @@ -239,21 +239,6 @@ int main(int argc,char **argv) std::cout << jcopy1 << std::endl << jveccopy1 << std::endl; } - { - ildgFormat format; - format.version =1.0; - format.field =std::string("su3gauge"); - format.precision =32; - format.lx =24; - format.ly =24; - format.lz =24; - format.lt =48; - XmlWriter WR("ildg-format.xml",""); - XmlWriter WRs("",""); - write(WR,"ildgFormat",format); - write(WRs,"ildgFormat",format); - std::cout << " XmlString: " < Date: Wed, 14 Jun 2017 10:53:39 +0100 Subject: [PATCH 063/170] QPX exchange support --- lib/simd/Grid_qpx.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index cbca9118..9fc8ef3c 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -375,6 +375,49 @@ namespace Optimization { FLOAT_WRAP_2(operator(), inline) }; + ////////////////////////////////////////////// + // Exchange support +#define FLOAT_WRAP_EXCHANGE(fn) \ + static inline void fn(vector4float &out1, vector4float &out2, \ + vector4float in1, vector4float in2) \ + { \ + vector4double out1d, out2d, in1d, in2d; \ + in1d = Vset()(in1); \ + in2d = Vset()(in2); \ + fn(out1d, out2d, in1d, in2d); \ + Vstore()(out1d, out1); \ + Vstore()(out2d, out2); \ + } + + struct Exchange{ + + // double precision + static inline void Exchange0(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0145)); + out2 = vec_perm(in1, in2, vec_gpci(02367)); + } + static inline void Exchange1(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0426)); + out2 = vec_perm(in1, in2, vec_gpci(01537)); + } + static inline void Exchange2(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } + static inline void Exchange3(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } + + // single precision + FLOAT_WRAP_EXCHANGE(Exchange0); + FLOAT_WRAP_EXCHANGE(Exchange1); + FLOAT_WRAP_EXCHANGE(Exchange2); + FLOAT_WRAP_EXCHANGE(Exchange3); + }; + struct Permute{ //Complex double static inline vector4double Permute0(vector4double v){ //0123 -> 2301 From 735cbdb983703fd3ffadc6133d792b4d058a897b Mon Sep 17 00:00:00 2001 From: Lanny91 Date: Wed, 14 Jun 2017 10:55:10 +0100 Subject: [PATCH 064/170] QPX Integer reduction (+ integer reduction test) --- lib/simd/Grid_qpx.h | 11 +++++++---- tests/Test_simd.cc | 47 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 9fc8ef3c..00dbace5 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -540,10 +540,13 @@ namespace Optimization { //Integer Reduce template<> - inline Integer Reduce::operator()(int in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + inline Integer Reduce::operator()(veci in){ + Integer a = 0; + for (unsigned int i = 0; i < W::r; ++i) + { + a += in.v[i]; + } + return a; } } diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index c0bbef1d..b2e8d68e 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -183,8 +183,6 @@ void IntTester(const functor &func) { typedef Integer scal; typedef 
vInteger vec; - GridSerialRNG sRNG; - sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); int Nsimd = vec::Nsimd(); @@ -287,6 +285,50 @@ void ReductionTester(const functor &func) } +template +void IntReductionTester(const functor &func) +{ + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + reduced result(0); + reduced reference(0); + reduced tmp; + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + + func.template vfunc(result,v_input1,v_input2); + + for(int i=0;i(tmp,input1[i],input2[i]); + reference+=tmp; + } + + std::cout<(funcReduce()); std::cout< Date: Fri, 16 Jun 2017 15:04:26 +0100 Subject: [PATCH 065/170] Placeholder precision change functions to allow Grid to compile with QPX (warning: no actual functionality) --- lib/simd/Grid_qpx.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 00dbace5..8de7bde8 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -374,6 +374,41 @@ namespace Optimization { // Complex float FLOAT_WRAP_2(operator(), inline) }; +#define USE_FP16 + struct PrecisionChange { + static inline vech StoH (const vector4float &a, const vector4float &b) { + vech ret; + std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoS (vech h, vector4float &sa, vector4float &sb) { + std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl; + assert(0); + } + static inline vector4float DtoS (vector4double a, vector4double b) { + vector4float ret; + std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void StoD (vector4float s, vector4double &a, vector4double &b) { + std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl; + assert(0); + } + static inline vech DtoH (vector4double a, vector4double b, + vector4double c, vector4double d) { + vech ret; + std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoD (vech h, vector4double &a, vector4double &b, + vector4double &c, vector4double &d) { + std::cout << GridLogError << "QPX half to double precision conversion not yet supported." 
<< std::endl;
+    assert(0);
+  }
+};

  //////////////////////////////////////////////
  // Exchange support
@@ -552,6 +587,7 @@ namespace Optimization {

////////////////////////////////////////////////////////////////////////////////
// Here assign types
+typedef Optimization::vech SIMD_Htype;         // Half precision type
 typedef Optimization::vector4float SIMD_Ftype; // Single precision type
 typedef vector4double SIMD_Dtype;              // Double precision type
 typedef Optimization::veci SIMD_Itype;         // Integer type

From a833f88c3237f9c941e9eb79ad459d0e260d2a2b Mon Sep 17 00:00:00 2001
From: Lanny91
Date: Fri, 16 Jun 2017 15:58:47 +0100
Subject: [PATCH 066/170] Added missing SIMD integer reduction implementation
 for AVX, AVX-512, SSE4, IMCI

---
 lib/simd/Grid_avx.h    | 25 ++++++++++++++++++++++---
 lib/simd/Grid_avx512.h | 22 +++++++++++++++++++---
 lib/simd/Grid_imci.h   |  4 +---
 lib/simd/Grid_sse4.h   |  6 +++---
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 52be9c05..57d9064d 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -701,9 +701,28 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i ret;
+#if defined (AVX2)
+    // AVX2 horizontal adds within upper and lower halves of register; use
+    // SSE to add upper and lower halves for result.
+    __m256i v1, v2;
+    __m128i u1, u2;
+    v1  = _mm256_hadd_epi32(in, in);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // lower half
+    u2  = _mm256_extracti128_si256(v2, 1); // upper half
+    ret = _mm_add_epi32(u1, u2);
+#else
+    // No AVX horizontal add; extract upper and lower halves of register & use
+    // SSE intrinsics.
+    __m128i u1, u2, u3;
+    u1  = _mm256_extractf128_si256(in, 0); // lower half
+    u2  = _mm256_extractf128_si256(in, 1); // upper half
+    u3  = _mm_add_epi32(u1, u2);
+    u1  = _mm_hadd_epi32(u3, u3);
+    ret = _mm_hadd_epi32(u1, u1);
+#endif
+    return _mm_cvtsi128_si32(ret);
   }
 }

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index ba054665..458a8f7c 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -543,6 +543,24 @@ namespace Optimization {
     u512d conv; conv.v = v1;
     return conv.f[0];
   }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    // No full vector reduce, use AVX to add upper and lower halves of register
+    // and perform AVX reduction.
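    // NB: a scalar cross-check of the intrinsic sequence below, assuming
    // sixteen int32 lanes (useful when validating against Test_simd):
    //   int32_t s = 0;
    //   for (int i = 0; i < 16; ++i) s += lane[i];
    //   return s;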
+    __m256i v1, v2, v3;
+    __m128i u1, u2, ret;
+    v1  = _mm512_castsi512_si256(in);       // lower half
+    v2  = _mm512_extracti32x8_epi32(in, 1); // upper half
+    v3  = _mm256_add_epi32(v1, v2);
+    v1  = _mm256_hadd_epi32(v3, v3);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // lower half
+    u2  = _mm256_extracti128_si256(v2, 1); // upper half
+    ret = _mm_add_epi32(u1, u2);
+    return _mm_cvtsi128_si32(ret);
+  }
 #else
   //Complex float Reduce
   template<>
@@ -570,9 +588,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }
 #endif

diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h
index 173e57d8..a1dae565 100644
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -401,9 +401,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }

diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 2fb2df76..0b1f9ffb 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -570,9 +570,9 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i v1 = _mm_hadd_epi32(in, in);
+    __m128i v2 = _mm_hadd_epi32(v1, v1);
+    return _mm_cvtsi128_si32(v2);
   }
 }

From d57217017075d38c8f170fe7b141ea6d7f662c16 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Sun, 18 Jun 2017 00:10:20 +0100
Subject: [PATCH 067/170] Update for SciDAC

---
 lib/GridStd.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/GridStd.h b/lib/GridStd.h
index 959ba9ac..097e62ab 100644
--- a/lib/GridStd.h
+++ b/lib/GridStd.h
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 

From 8e9be9f84f0aa38e94dfafa81d525526fbed9bc1 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Sun, 18 Jun 2017 00:10:42 +0100
Subject: [PATCH 068/170] Updates for SciDAC IO

---
 lib/parallelIO/BinaryIO.h | 135 +++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 76 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 7226ccba..117bec01 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -69,46 +69,6 @@ inline uint64_t Grid_ntohll(uint64_t A) {
 }
 #endif

-/////////////////////////////////////////////////////////////////////////////////
-// Simple classes for precision conversion
-/////////////////////////////////////////////////////////////////////////////////
-template <class fobj, class sobj>
-struct BinarySimpleUnmunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename
getPrecision::real_scalar_type sobj_stype; - - void operator()(fobj &in, sobj &out) { - // take word by word and transform accoding to the status - fobj_stype *in_buffer = (fobj_stype *)∈ - sobj_stype *out_buffer = (sobj_stype *)&out; - size_t fobj_words = sizeof(in) / sizeof(fobj_stype); - size_t sobj_words = sizeof(out) / sizeof(sobj_stype); - assert(fobj_words == sobj_words); - - for (unsigned int word = 0; word < sobj_words; word++) - out_buffer[word] = in_buffer[word]; // type conversion on the fly - - } -}; // A little helper inline void removeWhitespace(std::string &key) { @@ -126,11 +86,7 @@ class BinaryIO { // more byte manipulation helpers ///////////////////////////////////////////////////////////////////////////// - template static inline void Uint32Checksum(Lattice &lat, - uint32_t &nersc_csum, - uint32_t &scidac_csuma, - uint32_t &scidac_csumb) - + template static inline void Uint32Checksum(Lattice &lat,uint32_t &nersc_csum) { typedef typename vobj::scalar_object sobj; @@ -140,15 +96,38 @@ class BinaryIO { std::vector scalardata(lsites); unvectorizeToLexOrdArray(scalardata,lat); - Uint32Checksum(grid,scalardata,nersc_csum,scidac_csuma,scidac_csumb); + NerscChecksum(grid,scalardata,nersc_csum); } - template - static inline void Uint32Checksum(GridBase *grid, - std::vector &fbuf, - uint32_t &nersc_csum, - uint32_t &scidac_csuma, - uint32_t &scidac_csumb) + template static inline void NerscChecksum(GridBase *grid,std::vector &fbuf,uint32_t &nersc_csum) + { + const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); + + + uint64_t lsites =grid->lSites(); + if (fbuf.size()==1) { + lsites=1; + } + +#pragma omp parallel + { + uint32_t nersc_csum_thr=0; + +#pragma omp for + for(uint64_t local_site=0;local_site static inline void ScidacChecksum(GridBase *grid,std::vector &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) { const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); @@ -156,6 +135,9 @@ class BinaryIO { int nd = grid->_ndimension; uint64_t lsites =grid->lSites(); + if (fbuf.size()==1) { + lsites=1; + } std::vector local_vol =grid->LocalDimensions(); std::vector local_start =grid->LocalStarts(); std::vector global_vol =grid->FullDimensions(); @@ -163,21 +145,15 @@ class BinaryIO { #pragma omp parallel { std::vector coor(nd); - uint32_t nersc_csum_thr=0; uint32_t scidac_csuma_thr=0; uint32_t scidac_csumb_thr=0; uint32_t site_crc=0; - uint32_t zcrc = crc32(0L, Z_NULL, 0); #pragma omp for for(uint64_t local_site=0;local_site>(32-gsite29); scidac_csumb_thr ^= site_crc<>(32-gsite31); } #pragma omp critical { - nersc_csum += nersc_csum_thr; scidac_csuma^= scidac_csuma_thr; scidac_csumb^= scidac_csumb_thr; } @@ -386,7 +363,8 @@ class BinaryIO { assert(0); #endif } else { - std::cout<< GridLogMessage<< "C++ read I/O "<< file<< std::endl; + std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : " + << iodata.size()*sizeof(fobj)<<" bytes"<Barrier(); bstimer.Start(); + ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb); if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); - Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb); + NerscChecksum(grid,iodata,nersc_csum); bstimer.Stop(); } if ( control & BINARYIO_WRITE ) { bstimer.Start(); - Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb); + 
NerscChecksum(grid,iodata,nersc_csum); if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size()); + ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb); bstimer.Stop(); grid->Barrier(); @@ -436,9 +416,9 @@ class BinaryIO { assert(0); #endif } else { - std::cout<< GridLogMessage<< "C++ write I/O "<< file<< std::endl; - std::ofstream fout; - fout.open(file,std::ios::binary|std::ios::out|std::ios::in); + std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in); + std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : " + << iodata.size()*sizeof(fobj)<<" bytes"<GlobalXOR(scidac_csuma); grid->GlobalXOR(scidac_csumb); grid->Barrier(); - // std::cout << "Binary IO NERSC checksum 0x"< Date: Sun, 18 Jun 2017 00:11:02 +0100 Subject: [PATCH 069/170] SciDAC I/O and ILDG improvements --- lib/parallelIO/IldgIO.h | 552 ++++++++++++++++++++++++++--------- lib/parallelIO/IldgIOtypes.h | 149 ++++++++-- 2 files changed, 551 insertions(+), 150 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index a6810b0d..9a1612d5 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -27,6 +27,7 @@ directory #ifndef GRID_ILDG_IO_H #define GRID_ILDG_IO_H +#ifdef HAVE_LIME #include #include #include @@ -37,31 +38,153 @@ directory #include #include -#ifdef HAVE_LIME - +//Lime is a must have for this functionality extern "C" { // for linkage #include "lime.h" } - -// Unused SCIDAC records names -// SCIDAC_PRIVATE_FILE_XML "scidac-private-file-xml" -// SCIDAC_SITELIST "scidac-sitelist" -// SCIDAC_FILE_XML "scidac-file-xml" -// SCIDAC_RIVATE_RECORD_XML "scidac-private-record-xml" -// SCIDAC_RECORD_XML "scidac-record-xml" -// SCIDAC_BINARY_DATA "scidac-binary-data" -// -// Scidac checksum: CRC32 every site, xor reduce some hash of this. 
-// https://github.com/usqcd-software/qio/blob/master/lib/dml/DML_utils.c - namespace Grid { namespace QCD { -class IldgIO : public BinaryIO { + template inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); } + template<> inline std::string ScidacWordMnemonic (void){ return std::string("D"); } + template<> inline std::string ScidacWordMnemonic (void){ return std::string("F"); } + template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); } + template<> inline std::string ScidacWordMnemonic(void){ return std::string("U32_t"); } + template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); } + template<> inline std::string ScidacWordMnemonic(void){ return std::string("U64_t"); } + + template std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { + + typedef typename getPrecision::real_scalar_type stype; + + int _ColourN = indexRank(); + int _ColourScalar = isScalar(); + int _ColourVector = isVector(); + int _ColourMatrix = isMatrix(); + + int _SpinN = indexRank(); + int _SpinScalar = isScalar(); + int _SpinVector = isVector(); + int _SpinMatrix = isMatrix(); + + int _LorentzN = indexRank(); + int _LorentzScalar = isScalar(); + int _LorentzVector = isVector(); + int _LorentzMatrix = isMatrix(); + + std::stringstream stream; + + stream << "GRID_"; + stream << ScidacWordMnemonic(); + + // std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix< std::string ScidacRecordTypeString(Lattice & lat,int &colors, int &spins, int & typesize,int &datacount) { + return ScidacRecordTypeString(colors,spins,typesize,datacount); + }; + + template void ScidacMetaData(Lattice & field, + FieldMetaData &header, + scidacRecord & _scidacRecord, + scidacFile & _scidacFile) + { + typedef typename getPrecision::real_scalar_type stype; + + ///////////////////////////////////// + // Pull Grid's metadata + ///////////////////////////////////// + PrepareMetaData(field,header); + + ///////////////////////////////////// + // Scidac Private File structure + ///////////////////////////////////// + _scidacFile = scidacFile(field._grid); + + ///////////////////////////////////// + // Scidac Private Record structure + ///////////////////////////////////// + scidacRecord sr; + sr.datatype = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount); + sr.date = header.creation_date; + sr.precision = ScidacWordMnemonic(); + sr.recordtype = GRID_IO_FIELD; + + _scidacRecord = sr; + + std::cout << GridLogMessage << "Build SciDAC datatype " < + int readObject(serialisable_object &object,std::string object_name,std::string record_name) + + int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize); + template + int writeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name) + template + int writeLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name) + */ + /////////////////////////////////////////////////////// + // Lime utility functions + /////////////////////////////////////////////////////// + + static int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L) { LimeRecordHeader *h; h = limeCreateHeader(MB, ME, const_cast(message.c_str()), PayloadSize); @@ -70,6 +193,9 @@ class IldgIO : public BinaryIO { return LIME_SUCCESS; } + //////////////////////////////////////////// + // Write a generic 
serialisable object + //////////////////////////////////////////// template static void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, LimeWriter *LimeW) { @@ -81,24 +207,232 @@ class IldgIO : public BinaryIO { } uint64_t nbytes = xmlstring.size(); LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); - assert(limeWriteRecordHeader(h, LimeW)>=0); - assert(limeWriteRecordData(&xmlstring[0], &nbytes, LimeW)>=0); - limeWriterCloseRecord(LimeW); + int err=limeWriteRecordHeader(h, LimeW); assert(err>=0); + err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0); + err=limeWriterCloseRecord(LimeW); assert(err>=0); limeDestroyHeader(h); } + //////////////////////////////////////////// + // Read a generic serialisable object + //////////////////////////////////////////// + template + static void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name, LimeReader *LimeR) + { + std::string xmlstring; + // should this be a do while; can we miss a first record?? + while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { - static unsigned int writeHeader(FieldMetaData &header, LimeWriter *LimeW) { + uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration) + + if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) { + std::vector xmlc(nbytes+1,'\0'); + limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); + XmlReader RD(&xmlc[0],""); + read(RD,object_name,object); + return; + } + + } + assert(0); + } + + //////////////////////////////////////////// + // Read a generic lattice field and verify checksum + //////////////////////////////////////////// + template + static void readLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name,FILE *File, LimeReader *LimeR) + { + typedef typename vobj::scalar_object sobj; + scidacChecksum scidacChecksum_; + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + + std::string format = getFormatString(); + + while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { + + std::cout << GridLogMessage << limeReaderType(LimeR) < munge; + BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + + ///////////////////////////////////////////// + // Insist checksum is next record + ///////////////////////////////////////////// + readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name,LimeR); + + ///////////////////////////////////////////// + // Verify checksums + ///////////////////////////////////////////// + scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb); + return; + } + } + } + + //////////////////////////////////////////// + // Write a generic lattice field and csum + //////////////////////////////////////////// + template + static void writeLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name,FILE *File, LimeWriter *LimeW) + { + + //////////////////////////////////////////// + // Create record header + //////////////////////////////////////////// + typedef typename vobj::scalar_object sobj; + int err; + uint32_t nersc_csum,scidac_csuma,scidac_csumb; + uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites; + createLimeRecordHeader(record_name, 0, 0, PayloadSize, LimeW); + + //////////////////////////////////////////////////////////////////// + // NB: FILE and iostream are jointly writing disjoint sequences in the + // the same 
file through different file handles (integer units). + // + // These are both buffered, so why I think this code is right is as follows. + // + // i) write record header to FILE *File, telegraphing the size. + // ii) ftell reads the offset from FILE *File . + // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk. + // Closes iostream and flushes. + // iv) fseek on FILE * to end of this disjoint section. + // v) Continue writing scidac record. + //////////////////////////////////////////////////////////////////// + off_t offset = ftell(File); + std::string format = getFormatString(); + BinarySimpleMunger munge; + BinaryIO::writeLatticeObject(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + err=limeWriterCloseRecord(LimeW); assert(err>=0); + //////////////////////////////////////// + // Write checksum element, propagaing forward from the BinaryIO + // Always pair a checksum with a binary object, and close message + //////////////////////////////////////// + scidacChecksum checksum; + std::stringstream streama; streama << std::hex << scidac_csuma; + std::stringstream streamb; streamb << std::hex << scidac_csumb; + checksum.suma= streama.str(); + checksum.sumb= streamb.str(); + std::cout << GridLogMessage<<" writing scidac checksums "< + int open(std::string filename,GridBase *grid,userFile &_userFile,int volfmt) { + + } + void close(void) { + + } + template + int writeScidacField(Lattice &field,userRecord &_userRecord,int volfmt) + template + int readScidacField(Lattice &field,userRecord &_userRecord,int volfmt) + */ + //////////////////////////////////////////////// + // Write generic lattice field in scidac format + //////////////////////////////////////////////// + template + static void writeScidacField(std::string filename,Lattice &field,userFile _userFile,userRecord _userRecord) + { + typedef typename vobj::scalar_object sobj; + uint64_t nbytes; + GridBase * grid = field._grid; + + //////////////////////////////////////// + // fill the Grid header + //////////////////////////////////////// + FieldMetaData header; + scidacRecord _scidacRecord; + scidacFile _scidacFile; + + ScidacMetaData(field,header,_scidacRecord,_scidacFile); + + ////////////////////////////////////////////// + // Fill the Lime file record by record + ////////////////////////////////////////////// + FILE *File = fopen(filename.c_str(), "w"); + LimeWriter *LimeW = limeCreateWriter(File); + assert(LimeW != NULL ); + + writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message + writeLimeObject(0,0,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW); + writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW); + writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW); + writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW); + writeLimeLatticeBinaryObject(field,filename,std::string(ILDG_BINARY_DATA),File,LimeW); // Closes message with checksum + + limeDestroyWriter(LimeW); + fclose(File); + } +}; + +class IldgIO : public ScidacIO { + public: + + /////////////////////////////////// + // A little helper + /////////////////////////////////// + static void writeLimeIldgLFN(std::string &LFN,LimeWriter *LimeW) + { + uint64_t PayloadSize = LFN.size(); + int err; + createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, 
LimeW); + err=limeWriteRecordData(const_cast(LFN.c_str()), &PayloadSize, LimeW); assert(err>=0); + err=limeWriterCloseRecord(LimeW); assert(err>=0); + } + + //////////////////////////////////////////////////////////////// + // Special ILDG operations ; gauge configs only. + // Don't require scidac records EXCEPT checksum + // Use Grid MetaData object if present. + //////////////////////////////////////////////////////////////// + template + static void writeConfiguration(std::string filename,Lattice > &Umu) + { + GridBase * grid = Umu._grid; + typedef Lattice > GaugeField; + typedef iLorentzColourMatrix vobj; + typedef typename vobj::scalar_object sobj; uint64_t nbytes; - ildgFormat ildgfmt ; - usqcdInfo info; + //////////////////////////////////////// + // fill the Grid header + //////////////////////////////////////// + FieldMetaData header; + scidacRecord _scidacRecord; + scidacFile _scidacFile; + + ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); + + std::string format = header.floating_point; + + assert ( (format == std::string("IEEE32BIG")) + ||(format == std::string("IEEE64BIG")) ); ////////////////////////////////////////////////////// // Fill ILDG header data struct ////////////////////////////////////////////////////// + ildgFormat ildgfmt ; ildgfmt.field = std::string("su3gauge"); - ildgfmt.precision = 64; + + if ( format == std::string("IEEE32BIG") ) { + ildgfmt.precision = 32; + } else { + ildgfmt.precision = 64; + } ildgfmt.version = 1.0; ildgfmt.lx = header.dimension[0]; ildgfmt.ly = header.dimension[1]; @@ -107,108 +441,59 @@ class IldgIO : public BinaryIO { assert(header.nd==4); assert(header.nd==header.dimension.size()); + ////////////////////////////////////////////////////////////////////////////// + // Fill the USQCD info field + ////////////////////////////////////////////////////////////////////////////// + usqcdInfo info; info.version=1.0; info.plaq = header.plaquette; info.linktr = header.link_trace; - // Following scidac file downloaded from NERSC under MILC - // Begin message, keep open on successive records - //Message 1 - // Type: scidac-private-file-xml 1.1416 16 16 48 0 - // Type: scidac-file-xml MILC ILDG archival gauge configuration - //Message 2 - // Type: scidac-private-record-xml 1.0Thu May 11 00:11:33 2006 UTC0 - // QDP_F3_ColorMatrixF3724 - // Type: scidac-record-xml - // Type: ildg-format - // Type: ildg-data-lfn - // Type: ildg-binary-data - // Type: scidac-checksum - - writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); - writeLimeObject(0,0,info ,std::string("usqcdInfo" ),std::string(USQCD_INFO ),LimeW); - writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT),LimeW); - // LFN is not a serializable object - { - std::string LFN = header.ildg_lfn; - uint64_t PayloadSize = LFN.size(); - createHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW); - limeWriteRecordData(const_cast(LFN.c_str()), &PayloadSize, LimeW); - limeWriterCloseRecord(LimeW); - } - return 0; - } - - template - static void writeConfiguration(std::string filename,Lattice > &Umu, std::string format) { + std::cout << GridLogMessage << " Writing config; IldgIO "< > GaugeField; - typedef iLorentzColourMatrix vobj; - typedef typename vobj::scalar_object sobj; - typedef LorentzColourMatrixD fobj; - - GridBase * grid = Umu._grid; - - //////////////////////////////////////// - // fill the headers - //////////////////////////////////////// - FieldMetaData header; - - GridMetaData(grid,header); - GaugeStatistics(Umu,header); - 
MachineCharacteristics(header); - - assert( (format=="IEEE64BIG") || (format=="IEEE32BIG")); - header.floating_point = format; - header.checksum = 0x0; // unused in ILDG - writeHeader(header,LimeW); - - //////////////////////////////////////// - // Write data record header - //////////////////////////////////////// - uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites; - createHeader(ILDG_BINARY_DATA, 0, 0, PayloadSize, LimeW); - - off_t offset = ftell(File); - uint32_t nersc_csum,scidac_csuma,scidac_csumb; - GaugeSimpleMunger munge; - BinaryIO::writeLatticeObject(Umu, filename, munge, offset, header.floating_point, - nersc_csum,scidac_csuma,scidac_csumb); - limeWriterCloseRecord(LimeW); - - //////////////////////////////////////// - // Write checksum element, propagaing forward from the BinaryIO - //////////////////////////////////////// - scidacChecksum checksum; - checksum.suma= scidac_csuma; - checksum.sumb= scidac_csumb; - // std::cout << " writing scidac checksums "< static void readConfiguration(std::string filename,Lattice > &Umu, FieldMetaData &FieldMetaData_) { typedef Lattice > GaugeField; - typedef LorentzColourMatrixD sobjd; - typedef LorentzColourMatrixF sobjf; - typedef iLorentzColourMatrix itype; - typedef LorentzColourMatrix sobj; + typedef typename GaugeField::vector_object vobj; + typedef typename vobj::scalar_object sobj; + + typedef LorentzColourMatrixF fobj; + typedef LorentzColourMatrixD dobj; GridBase *grid = Umu._grid; std::vector dims = Umu._grid->FullDimensions(); + assert(dims.size()==4); FILE *File = fopen(filename.c_str(), "r"); LimeReader *LimeR = limeCreateReader(File); - // Metadata holders ildgFormat ildgFormat_ ; std::string ildgLFN_ ; @@ -263,8 +548,6 @@ class IldgIO : public BinaryIO { if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG"); if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG"); - // std::cout << "This is an ILDG format record : "< munge; - BinaryIO::readLatticeObject< itype, sobjd >(Umu, filename, munge, offset, format, - nersc_csum,scidac_csuma,scidac_csumb); + + if ( format == std::string("IEEE64BIG") ) { + GaugeSimpleMunger munge; + BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + } else { + GaugeSimpleMunger munge; + BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + } + found_ildgBinary = 1; } @@ -328,8 +610,10 @@ class IldgIO : public BinaryIO { ////////////////////////////////////////////////////// // Minimally must find binary segment and checksum + // Since this is an ILDG reader require ILDG format ////////////////////////////////////////////////////// assert(found_ildgBinary); + assert(found_ildgFormat); assert(found_scidacChecksum); // Must find something with the lattice dimensions @@ -337,9 +621,7 @@ class IldgIO : public BinaryIO { if ( found_FieldMetaData ) { - std::cout << GridLogMessage<<"a Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<(Umu,checker); + GaugeStatistics(Umu,checker); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; } } + }; - // format for RNG? 
Now just binary out -}; -} -} +}} //HAVE_LIME #endif
diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h
index 8e1316eb..c3a5321c 100644
--- a/lib/parallelIO/IldgIOtypes.h
+++ b/lib/parallelIO/IldgIOtypes.h
@@ -34,16 +34,110 @@ extern "C" { // for linkage
 
 namespace Grid {
 
+/////////////////////////////////////////////////////////////////////////////////
+// Data representation of records that enter ILDG and SciDac formats
+/////////////////////////////////////////////////////////////////////////////////
+
 #define GRID_FORMAT "grid-format"
 #define ILDG_FORMAT "ildg-format"
 #define ILDG_BINARY_DATA "ildg-binary-data"
 #define ILDG_DATA_LFN "ildg-data-lfn"
-#define USQCD_INFO "usqcdInfo"
-#define SCIDAC_CHECKSUM "scidac-checksum"
+#define SCIDAC_CHECKSUM "scidac-checksum"
+#define SCIDAC_PRIVATE_FILE_XML "scidac-private-file-xml"
+#define SCIDAC_FILE_XML "scidac-file-xml"
+#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
+#define SCIDAC_RECORD_XML "scidac-record-xml"
+#define SCIDAC_BINARY_DATA "scidac-binary-data"
+// Unused SCIDAC record names; could move to support this functionality
+#define SCIDAC_SITELIST "scidac-sitelist"
+
+  ////////////////////////////////////////////////////////////
+  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
+  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
+  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
+  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
+  ////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////////
-// Data representation of records that enter ILDG and SciDac formats
+// QIO uses mandatory "private" records in a fixed format.
+// Private is in principle "opaque"; however it can't be changed now because that would break existing
+// file compatibility, so it should be correct to assume the undocumented but de facto file structure.
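//
// For orientation, the de facto scidac-private-file-xml payload looks like this
// (a sketch reconstructed from QIO output; tag order assumed, values shown for
// a 16^3 x 32 single-file write):
//
//   <?xml version="1.0" encoding="UTF-8"?>
//   <scidacFile>
//     <version>1.1</version>
//     <spacetime>4</spacetime>
//     <dims>16 16 16 32 </dims>
//     <volfmt>0</volfmt>
//   </scidacFile>
//
// The scidacFile struct below carries exactly these four fields, with <dims>
// kept as the space-separated string QIO writes.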
///////////////////////////////////////////////////////////////////////////////// + +//////////////////////// +// Scidac private file xml +// 1.1416 16 16 32 0 +//////////////////////// +struct scidacFile : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile, + double, version, + int, spacetime, + std::string, dims, // must convert to int + int, volfmt); + + std::vector getDimensions(void) { + std::stringstream stream(dims); + std::vector dimensions; + int n; + while(stream >> n){ + dimensions.push_back(n); + } + return dimensions; + } + + void setDimensions(std::vector dimensions) { + char delimiter = ' '; + std::stringstream stream; + for(int i=0;i_ndimension; + setDimensions(grid->FullDimensions()); + volfmt = GRID_IO_SINGLEFILE; + } + +}; + +/////////////////////////////////////////////////////////////////////// +// scidac-private-record-xml : example +// +// 1.1Tue Jul 26 21:14:44 2011 UTC0 +// QDP_D3_ColorMatrixD34 +// 1444 +// +/////////////////////////////////////////////////////////////////////// + +struct scidacRecord : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord, + double, version, + std::string, date, + int, recordtype, + std::string, datatype, + std::string, precision, + int, colors, + int, spins, + int, typesize, + int, datacount); + + scidacRecord() { version =1.0; } + +}; + +//////////////////////// +// ILDG format +//////////////////////// struct ildgFormat : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat, @@ -54,10 +148,11 @@ public: int, ly, int, lz, int, lt); - ildgFormat() { - version=1.0; - }; + ildgFormat() { version=1.0; }; }; +//////////////////////// +// USQCD info +//////////////////////// struct usqcdInfo : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo, @@ -69,7 +164,36 @@ struct usqcdInfo : Serializable { version=1.0; }; }; +//////////////////////// +// Scidac Checksum +//////////////////////// +struct scidacChecksum : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum, + double, version, + std::string, suma, + std::string, sumb); + scidacChecksum() { + version=1.0; + }; +}; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Type: scidac-file-xml MILC ILDG archival gauge configuration +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Type: +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////// +// Scidac private file xml +// 1.1416 16 16 32 0 +//////////////////////// + +#if 0 +//////////////////////////////////////////////////////////////////////////////////////// +// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf +//////////////////////////////////////////////////////////////////////////////////////// struct usqcdPropFile : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile, @@ -100,17 +224,8 @@ struct usqcdPropInfo : Serializable { version=1.0; }; }; -struct scidacChecksum : Serializable { - public: - GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum, - double, version, - uint32_t, suma, - uint32_t, sumb); - scidacChecksum() { - version=1.0; - suma=sumb=0; - }; -}; +#endif + } #endif #endif From ae4de947989d1c9299b7dbeb8c1a570f745a84d7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Jun 2017 
00:11:23 +0100 Subject: [PATCH 070/170] SciDAC I/O support --- lib/parallelIO/MetaData.h | 124 ++++++++++++++++++++++++++++++++++---- lib/parallelIO/NerscIO.h | 4 +- 2 files changed, 114 insertions(+), 14 deletions(-) diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h index 1bad07f2..6d45d0a5 100644 --- a/lib/parallelIO/MetaData.h +++ b/lib/parallelIO/MetaData.h @@ -38,9 +38,24 @@ namespace Grid { - //////////////////////////////////////////////////////////////////////////////// - // header specification/interpretation - //////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////// + // Precision mapping + /////////////////////////////////////////////////////// + template static std::string getFormatString (void) + { + std::string format; + typedef typename getPrecision::real_scalar_type stype; + if ( sizeof(stype) == sizeof(float) ) { + format = std::string("IEEE32BIG"); + } + if ( sizeof(stype) == sizeof(double) ) { + format = std::string("IEEE64BIG"); + } + return format; + } + //////////////////////////////////////////////////////////////////////////////// + // header specification/interpretation + //////////////////////////////////////////////////////////////////////////////// class FieldMetaData : Serializable { public: @@ -66,8 +81,15 @@ namespace Grid { std::string, creation_date, std::string, archive_date, std::string, floating_point); + FieldMetaData(void) { + nd=4; + dimension.resize(4); + boundary.resize(4); + } }; + + namespace QCD { using namespace Grid; @@ -89,13 +111,6 @@ namespace Grid { header.boundary[d] = std::string("PERIODIC"); } } - template - inline void GaugeStatistics(GaugeField & data,FieldMetaData &header) - { - // How to convert data precision etc... - header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); - header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); - } inline void MachineCharacteristics(FieldMetaData &header) { @@ -133,7 +148,7 @@ namespace Grid { s << "BOUNDARY_"< inline void PrepareMetaData(Lattice & field, FieldMetaData &header) +{ + GridBase *grid = field._grid; + std::string format = getFormatString(); + header.floating_point = format; + header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac + GridMetaData(grid,header); + MachineCharacteristics(header); + } + inline void GaugeStatistics(Lattice & data,FieldMetaData &header) + { + // How to convert data precision etc... + header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data); + header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data); + } + inline void GaugeStatistics(Lattice & data,FieldMetaData &header) + { + // How to convert data precision etc... 
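// (A note on the duplication here: GaugeStatistics is written out once per
// precision rather than templated, presumably so that WilsonLoops is
// instantiated at the field's own precision; the PrepareMetaData<>
// specialisations that follow dispatch to whichever overload matches.)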
+    header.link_trace=Grid::QCD::WilsonLoops::linkTrace(data);
+    header.plaquette =Grid::QCD::WilsonLoops::avgPlaquette(data);
+  }
+  template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header)
+  {
+
+    GridBase *grid = field._grid;
+    std::string format = getFormatString();
+    header.floating_point = format;
+    header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+    GridMetaData(grid,header);
+    GaugeStatistics(field,header);
+    MachineCharacteristics(header);
+  }
+  template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header)
+  {
+    GridBase *grid = field._grid;
+    std::string format = getFormatString();
+    header.floating_point = format;
+    header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+    GridMetaData(grid,header);
+    GaugeStatistics(field,header);
+    MachineCharacteristics(header);
+  }
 
 //////////////////////////////////////////////////////////////////////
 // Utilities ; these are QCD aware
@@ -171,6 +228,48 @@ namespace Grid {
 typedef iLorentzColour2x3 LorentzColour2x3F;
 typedef iLorentzColour2x3 LorentzColour2x3D;
 
+/////////////////////////////////////////////////////////////////////////////////
+// Simple classes for precision conversion
+/////////////////////////////////////////////////////////////////////////////////
+template <class fobj, class sobj>
+struct BinarySimpleUnmunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+
+  void operator()(sobj &in, fobj &out) {
+    // take word by word and transform according to the status
+    fobj_stype *out_buffer = (fobj_stype *)&out;
+    sobj_stype *in_buffer = (sobj_stype *)&in;
+    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word]; // type conversion on the fly
+
+  }
+};
+
+template <class fobj, class sobj>
+struct BinarySimpleMunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+
+  void operator()(fobj &in, sobj &out) {
+    // take word by word and transform according to the status
+    fobj_stype *in_buffer = (fobj_stype *)&in;
+    sobj_stype *out_buffer = (sobj_stype *)&out;
+    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word]; // type conversion on the fly
+
+  }
+};
+
 template 
 struct GaugeSimpleMunger{
   void operator()(fobj &in, sobj &out) {
@@ -220,6 +319,7 @@ namespace Grid {
     }
   }
 };
- }
+
+ }
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index cc37b537..786839f2 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -179,7 +179,7 @@ namespace Grid {
       assert(0);
     }
-    GaugeStatistics(Umu,clone);
+    GaugeStatistics(Umu,clone);
    std::cout<(Umu,header);
+    GaugeStatistics(Umu,header);
    MachineCharacteristics(header);
    int offset;

From 46879e165814015c8d82195771573df01a1edd66 Mon Sep 17 00:00:00 2001
From: paboyle 
Date: Sun, 18 Jun 2017 00:11:45 +0100
Subject: [PATCH 071/170] Complex defined in Impl even for gauge.
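
With ComplexField (and SiteComplex) exported by the gauge Impl, code templated
on Gimpl can build per-site complex observables without naming a global lattice
type. A minimal sketch (the class and the elided loop body are illustrative,
not part of this patch):

    template <class Gimpl> class SitePlaquette {
     public:
      INHERIT_GIMPL_TYPES(Gimpl);
      // ComplexField now comes from the implementation, so this compiles
      // unchanged for whatever precision/representation the Impl selects.
      static RealD avg(const GaugeField &U) {
        ComplexField plaq(U._grid);
        plaq = zero;
        // ... accumulate traced plaquettes into plaq here ...
        auto Tp = sum(plaq);
        return TensorRemove(Tp).real();
      }
    };

WilsonLoops is converted to this pattern in a later patch in this series.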
--- lib/qcd/action/fermion/FermionOperatorImpl.h | 6 ------ lib/qcd/action/gauge/GaugeImplTypes.h | 16 +++++++++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 20458b6d..524179f5 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -644,19 +644,16 @@ class StaggeredImpl : public PeriodicGaugeImpl using iImplScalar = iScalar > >; template using iImplSpinor = iScalar > >; template using iImplHalfSpinor = iScalar > >; template using iImplDoubledGaugeField = iVector >, Nds>; template using iImplPropagator = iScalar > >; - typedef iImplScalar SiteComplex; typedef iImplSpinor SiteSpinor; typedef iImplHalfSpinor SiteHalfSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef iImplPropagator SitePropagator; - typedef Lattice ComplexField; typedef Lattice FermionField; typedef Lattice DoubledGaugeField; typedef Lattice PropagatorField; @@ -775,7 +772,6 @@ class StaggeredImpl : public PeriodicGaugeImpl using iImplScalar = iScalar > >; template using iImplSpinor = iScalar > >; template using iImplHalfSpinor = iScalar > >; template using iImplDoubledGaugeField = iVector >, Nds>; @@ -792,12 +788,10 @@ class StaggeredImpl : public PeriodicGaugeImpl DoubledGaugeField; typedef Lattice PropagatorField; - typedef iImplScalar SiteComplex; typedef iImplSpinor SiteSpinor; typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice ComplexField; typedef Lattice FermionField; typedef SimpleCompressor Compressor; diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/lib/qcd/action/gauge/GaugeImplTypes.h index 9d36eead..0c0df219 100644 --- a/lib/qcd/action/gauge/GaugeImplTypes.h +++ b/lib/qcd/action/gauge/GaugeImplTypes.h @@ -40,12 +40,15 @@ namespace QCD { typedef typename GImpl::Simd Simd; \ typedef typename GImpl::LinkField GaugeLinkField; \ typedef typename GImpl::Field GaugeField; \ + typedef typename GImpl::ComplexField ComplexField;\ typedef typename GImpl::SiteField SiteGaugeField; \ + typedef typename GImpl::SiteComplex SiteComplex; \ typedef typename GImpl::SiteLink SiteGaugeLink; -#define INHERIT_FIELD_TYPES(Impl) \ - typedef typename Impl::Simd Simd; \ - typedef typename Impl::SiteField SiteField; \ +#define INHERIT_FIELD_TYPES(Impl) \ + typedef typename Impl::Simd Simd; \ + typedef typename Impl::ComplexField ComplexField; \ + typedef typename Impl::SiteField SiteField; \ typedef typename Impl::Field Field; // hardcodes the exponential approximation in the template @@ -53,12 +56,15 @@ template class GaugeImplType public: typedef S Simd; - template using iImplGaugeLink = iScalar>>; - template using iImplGaugeField = iVector>, Nd>; + template using iImplScalar = iScalar > >; + template using iImplGaugeLink = iScalar > >; + template using iImplGaugeField = iVector >, Nd>; + typedef iImplScalar SiteComplex; typedef iImplGaugeLink SiteLink; typedef iImplGaugeField SiteField; + typedef Lattice ComplexField; typedef Lattice LinkField; typedef Lattice Field; From b96daf53a0c060c530eee3769861133d764589cf Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Jun 2017 00:12:15 +0100 Subject: [PATCH 072/170] Query tensor structures --- lib/tensors/Tensor_index.h | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/lib/tensors/Tensor_index.h b/lib/tensors/Tensor_index.h index 7f34f3ac..f114baf8 100644 --- a/lib/tensors/Tensor_index.h +++ b/lib/tensors/Tensor_index.h @@ -47,6 +47,28 @@ 
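// For concreteness, what the new queries report on a colour matrix, i.e. on
// iScalar<iScalar<iMatrix<vComplex,Nc>>> (a sketch; Nc=3 and the usual
// Lorentz/Spin/Colour index ordering assumed):
//
//   indexRank<ColourIndex, vColourMatrix>()  ->  3    // an Nc x Nc index
//   isMatrix <ColourIndex, vColourMatrix>()  ->  1    // true
//   isScalar <SpinIndex,   vColourMatrix>()  ->  1    // scalar in spin
//
// ScidacRecordTypeString above is the first consumer of these queries.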
template class TensorIndexRecursion { public: + + //////////////////////////////////////////////////// + // Type Queries + //////////////////////////////////////////////////// + template static inline int indexRank(const iScalar tmp) { return TensorIndexRecursion::indexRank(tmp._internal); } + template static inline int indexRank(const iVector tmp){ return TensorIndexRecursion::indexRank(tmp._internal[0]); } + template static inline int indexRank(const iMatrix tmp){ return TensorIndexRecursion::indexRank(tmp._internal[0][0]); } + + template static inline int isScalar(const iScalar tmp) { return TensorIndexRecursion::isScalar(tmp._internal); } + template static inline int isScalar(const iVector tmp){ return TensorIndexRecursion::isScalar(tmp._internal[0]); } + template static inline int isScalar(const iMatrix tmp){ return TensorIndexRecursion::isScalar(tmp._internal[0][0]); } + + template static inline int isVector(const iScalar tmp) { return TensorIndexRecursion::isVector(tmp._internal); } + template static inline int isVector(const iVector tmp){ return TensorIndexRecursion::isVector(tmp._internal[0]); } + template static inline int isVector(const iMatrix tmp){ return TensorIndexRecursion::isVector(tmp._internal[0][0]); } + + template static inline int isMatrix(const iScalar tmp) { return TensorIndexRecursion::isMatrix(tmp._internal); } + template static inline int isMatrix(const iVector tmp){ return TensorIndexRecursion::isMatrix(tmp._internal[0]); } + template static inline int isMatrix(const iMatrix tmp){ return TensorIndexRecursion::isMatrix(tmp._internal[0][0]); } + //////////////////////////////////////////////////// + // Trace + //////////////////////////////////////////////////// template static auto traceIndex(const iScalar arg) -> iScalar::traceIndex(arg._internal))> { @@ -215,6 +237,24 @@ class TensorIndexRecursion { template<> class TensorIndexRecursion<0> { public: + //////////////////////////////////////////////////// + // Type Queries + //////////////////////////////////////////////////// + template static inline int indexRank(const iScalar tmp) { return 1; } + template static inline int indexRank(const iVector tmp){ return N; } + template static inline int indexRank(const iMatrix tmp){ return N; } + + template static inline int isScalar(const iScalar tmp) { return true;} + template static inline int isScalar(const iVector tmp){ return false;} + template static inline int isScalar(const iMatrix tmp){ return false;} + + template static inline int isVector(const iScalar tmp) { return false;} + template static inline int isVector(const iVector tmp){ return true;} + template static inline int isVector(const iMatrix tmp){ return false;} + + template static inline int isMatrix(const iScalar tmp) { return false;} + template static inline int isMatrix(const iVector tmp){ return false;} + template static inline int isMatrix(const iMatrix tmp){ return true;} ///////////////////////////////////////// // Ends recursion for trace (scalar/vector/matrix) @@ -302,6 +342,26 @@ class TensorIndexRecursion<0> { //////////////////////////////////////////////////////////////////////////////////////////////////////// // External wrappers //////////////////////////////////////////////////////////////////////////////////////////////////////// +template inline int indexRank(void) +{ + vtype tmp; + return TensorIndexRecursion::indexRank(tmp); +} +template inline int isScalar(void) +{ + vtype tmp; + return TensorIndexRecursion::isScalar(tmp); +} +template inline int isVector(void) +{ + vtype tmp; + 
return TensorIndexRecursion::isVector(tmp); +} +template inline int isMatrix(void) +{ + vtype tmp; + return TensorIndexRecursion::isMatrix(tmp); +} template inline auto traceIndex (const vtype &arg) -> RemoveCRV(TensorIndexRecursion::traceIndex(arg)) { From ae39ec85a3b89072d9ea325cb953068a064ec822 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Jun 2017 00:12:48 +0100 Subject: [PATCH 073/170] ComplexField defined --- lib/qcd/utils/WilsonLoops.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 5382882e..ea713ec8 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -73,7 +73,7 @@ public: ////////////////////////////////////////////////// // trace of directed plaquette oriented in mu,nu plane ////////////////////////////////////////////////// - static void traceDirPlaquette(LatticeComplex &plaq, + static void traceDirPlaquette(ComplexField &plaq, const std::vector &U, const int mu, const int nu) { GaugeMat sp(U[0]._grid); @@ -83,9 +83,9 @@ public: ////////////////////////////////////////////////// // sum over all planes of plaquette ////////////////////////////////////////////////// - static void sitePlaquette(LatticeComplex &Plaq, + static void sitePlaquette(ComplexField &Plaq, const std::vector &U) { - LatticeComplex sitePlaq(U[0]._grid); + ComplexField sitePlaq(U[0]._grid); Plaq = zero; for (int mu = 1; mu < Nd; mu++) { for (int nu = 0; nu < mu; nu++) { @@ -104,11 +104,11 @@ public: U[mu] = PeekIndex(Umu, mu); } - LatticeComplex Plaq(Umu._grid); + ComplexField Plaq(Umu._grid); sitePlaquette(Plaq, U); - TComplex Tp = sum(Plaq); - Complex p = TensorRemove(Tp); + auto Tp = sum(Plaq); + auto p = TensorRemove(Tp); return p.real(); } @@ -129,15 +129,15 @@ public: static RealD linkTrace(const GaugeLorentz &Umu) { std::vector U(Nd, Umu._grid); - LatticeComplex Tr(Umu._grid); + ComplexField Tr(Umu._grid); Tr = zero; for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(Umu, mu); Tr = Tr + trace(U[mu]); } - TComplex Tp = sum(Tr); - Complex p = TensorRemove(Tp); + auto Tp = sum(Tr); + auto p = TensorRemove(Tp); double vol = Umu._grid->gSites(); @@ -330,8 +330,8 @@ public: double coeff = 8.0/(32.0*M_PI*M_PI); - LatticeComplex qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); - TComplex Tq = sum(qfield); + ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); + auto Tq = sum(qfield); return TensorRemove(Tq).real(); } @@ -350,16 +350,16 @@ public: adj(Gimpl::CovShiftForward( U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu]))); } - static void traceDirRectangle(LatticeComplex &rect, + static void traceDirRectangle(ComplexField &rect, const std::vector &U, const int mu, const int nu) { GaugeMat sp(U[0]._grid); dirRectangle(sp, U, mu, nu); rect = trace(sp); } - static void siteRectangle(LatticeComplex &Rect, + static void siteRectangle(ComplexField &Rect, const std::vector &U) { - LatticeComplex siteRect(U[0]._grid); + ComplexField siteRect(U[0]._grid); Rect = zero; for (int mu = 1; mu < Nd; mu++) { for (int nu = 0; nu < mu; nu++) { @@ -379,12 +379,12 @@ public: U[mu] = PeekIndex(Umu, mu); } - LatticeComplex Rect(Umu._grid); + ComplexField Rect(Umu._grid); siteRectangle(Rect, U); - TComplex Tp = sum(Rect); - Complex p = TensorRemove(Tp); + auto Tp = sum(Rect); + auto p = TensorRemove(Tp); return p.real(); } ////////////////////////////////////////////////// From 1d18d95d4f1457e2f37f0237db79873a346873df Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Jun 
2017 00:13:03 +0100 Subject: [PATCH 074/170] Class name return --- lib/serialisation/MacroMagic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index 04f1b401..774c947f 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -115,6 +115,7 @@ THE SOFTWARE. #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B); #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\ + std::string SerialisableClassName(void) {return std::string(#cname);} \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\ template \ static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ From e6d984b484f9679bf1240414b2df239bc888e595 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Jun 2017 00:13:22 +0100 Subject: [PATCH 075/170] ILDG tests --- tests/IO/Test_ildg_io.cc | 2 +- tests/IO/Test_ildg_read.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc index 1408c638..199773ab 100644 --- a/tests/IO/Test_ildg_io.cc +++ b/tests/IO/Test_ildg_io.cc @@ -78,7 +78,7 @@ int main (int argc, char ** argv) std::cout < U(4,&Fine); FieldMetaData header; - std::string file("./ckpoint_lat"); + std::string file("./ildg.file"); IldgIO::readConfiguration(file,Umu,header); for(int mu=0;mu Date: Mon, 19 Jun 2017 01:01:48 +0100 Subject: [PATCH 076/170] Update to enable multiple records per file more consistent with SciDAC. open, close, write records... --- lib/parallelIO/IldgIO.h | 285 ++++++++++--------- lib/qcd/hmc/checkpointers/ILDGCheckpointer.h | 10 +- tests/IO/Test_ildg_io.cc | 10 +- tests/IO/Test_ildg_read.cc | 5 +- 4 files changed, 173 insertions(+), 137 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 9a1612d5..1d1b5e0c 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -38,14 +38,17 @@ directory #include #include -//Lime is a must have for this functionality -extern "C" { // for linkage +//C-Lime is a must have for this functionality +extern "C" { #include "lime.h" } namespace Grid { namespace QCD { + ///////////////////////////////// + // Encode word types as strings + ///////////////////////////////// template inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); } template<> inline std::string ScidacWordMnemonic (void){ return std::string("D"); } template<> inline std::string ScidacWordMnemonic (void){ return std::string("F"); } @@ -54,6 +57,9 @@ namespace QCD { template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); } template<> inline std::string ScidacWordMnemonic(void){ return std::string("U64_t"); } + ///////////////////////////////////////// + // Encode a generic tensor as a string + ///////////////////////////////////////// template std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { typedef typename getPrecision::real_scalar_type stype; @@ -113,6 +119,10 @@ namespace QCD { return ScidacRecordTypeString(colors,spins,typesize,datacount); }; + + //////////////////////////////////////////////////////////// + // Helper to fill out metadata + //////////////////////////////////////////////////////////// template void ScidacMetaData(Lattice & field, FieldMetaData &header, scidacRecord & _scidacRecord, @@ -159,88 +169,38 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////// // Lime, ILDG and Scidac I/O classes 
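//
// With this change the monolithic static classes become stateful reader and
// writer objects with an open / process records / close life cycle. A sketch
// of the driver-side pattern (filename and record names illustrative):
//
//   GridLimeWriter LW;
//   LW.open(filename);
//   LW.writeLimeObject(1,1,obj,"myObject","my-record-type"); // MB=1,ME=1 : single-record message
//   LW.close();
//
//   GridLimeReader LR;
//   LR.open(filename);
//   LR.readLimeObject(obj,"myObject","my-record-type");
//   LR.close();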
//////////////////////////////////////////////////////////////////////////////////// -class LimeIO : public BinaryIO { +class GridLimeReader : public BinaryIO { public: - /////////////////////////////////////////////////// // FIXME: format for RNG? Now just binary out instead - // FIXME: Make interface able to write multiple records - // FIXME: Split into LimeReader and LimeWriter /////////////////////////////////////////////////// - /* - FILE * File; - LimeWriter LimeW; - LimeReader LimeR; - template - int readObject(serialisable_object &object,std::string object_name,std::string record_name) - int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize); - template - int writeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name) - template - int writeLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name) - */ - /////////////////////////////////////////////////////// - // Lime utility functions - /////////////////////////////////////////////////////// + FILE *File; + LimeReader *LimeR; + std::string filename; - static int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L) - { - LimeRecordHeader *h; - h = limeCreateHeader(MB, ME, const_cast(message.c_str()), PayloadSize); - assert(limeWriteRecordHeader(h, L) >= 0); - limeDestroyHeader(h); - return LIME_SUCCESS; - } - - //////////////////////////////////////////// - // Write a generic serialisable object - //////////////////////////////////////////// - template - static void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, LimeWriter *LimeW) - { - std::string xmlstring; - { - XmlWriter WR("",""); - write(WR,object_name,object); - xmlstring = WR.XmlString(); - } - uint64_t nbytes = xmlstring.size(); - LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); - int err=limeWriteRecordHeader(h, LimeW); assert(err>=0); - err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0); - err=limeWriterCloseRecord(LimeW); assert(err>=0); - limeDestroyHeader(h); - } - //////////////////////////////////////////// - // Read a generic serialisable object - //////////////////////////////////////////// - template - static void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name, LimeReader *LimeR) - { - std::string xmlstring; - // should this be a do while; can we miss a first record?? 
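// (Assuming standard C-Lime semantics, while is correct here: limeCreateReader
// leaves the reader positioned *before* the first record and each
// limeReaderNextRecord() call advances onto the next record header, so record
// one is visited; a do/while would touch the reader before it is positioned.)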
- while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { - - uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration) - - if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) { - std::vector xmlc(nbytes+1,'\0'); - limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); - XmlReader RD(&xmlc[0],""); - read(RD,object_name,object); - return; - } - - } - assert(0); - } + ///////////////////////////////////////////// + // Open the file + ///////////////////////////////////////////// + void open(std::string &_filename) + { + filename= _filename; + File = fopen(filename.c_str(), "r"); + LimeR = limeCreateReader(File); + } + ///////////////////////////////////////////// + // Close the file + ///////////////////////////////////////////// + void close(void){ + fclose(File); + // limeDestroyReader(LimeR); + } //////////////////////////////////////////// // Read a generic lattice field and verify checksum //////////////////////////////////////////// template - static void readLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name,FILE *File, LimeReader *LimeR) + void readLimeLatticeBinaryObject(Lattice &field,std::string record_name) { typedef typename vobj::scalar_object sobj; scidacChecksum scidacChecksum_; @@ -262,7 +222,7 @@ class LimeIO : public BinaryIO { ///////////////////////////////////////////// // Insist checksum is next record ///////////////////////////////////////////// - readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name,LimeR); + readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name); ///////////////////////////////////////////// // Verify checksums @@ -272,14 +232,91 @@ class LimeIO : public BinaryIO { } } } + //////////////////////////////////////////// + // Read a generic serialisable object + //////////////////////////////////////////// + template + void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name) + { + std::string xmlstring; + // should this be a do while; can we miss a first record?? + while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { + uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration) + + if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) { + std::vector xmlc(nbytes+1,'\0'); + limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); + XmlReader RD(&xmlc[0],""); + read(RD,object_name,object); + return; + } + + } + assert(0); + } +}; + +class GridLimeWriter : public BinaryIO { + public: + /////////////////////////////////////////////////// + // FIXME: format for RNG? 
Now just binary out instead + /////////////////////////////////////////////////// + + FILE *File; + LimeWriter *LimeW; + std::string filename; + + void open(std::string &_filename) { + filename= _filename; + File = fopen(filename.c_str(), "w"); + LimeW = limeCreateWriter(File); assert(LimeW != NULL ); + } + ///////////////////////////////////////////// + // Close the file + ///////////////////////////////////////////// + void close(void) { + fclose(File); + // limeDestroyWriter(LimeW); + } + /////////////////////////////////////////////////////// + // Lime utility functions + /////////////////////////////////////////////////////// + int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize) + { + LimeRecordHeader *h; + h = limeCreateHeader(MB, ME, const_cast(message.c_str()), PayloadSize); + assert(limeWriteRecordHeader(h, LimeW) >= 0); + limeDestroyHeader(h); + return LIME_SUCCESS; + } + //////////////////////////////////////////// + // Write a generic serialisable object + //////////////////////////////////////////// + template + void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name) + { + std::string xmlstring; + { + XmlWriter WR("",""); + write(WR,object_name,object); + xmlstring = WR.XmlString(); + } + uint64_t nbytes = xmlstring.size(); + int err; + LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL); + + err=limeWriteRecordHeader(h, LimeW); assert(err>=0); + err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0); + err=limeWriterCloseRecord(LimeW); assert(err>=0); + limeDestroyHeader(h); + } //////////////////////////////////////////// // Write a generic lattice field and csum //////////////////////////////////////////// template - static void writeLimeLatticeBinaryObject(Lattice &field,std::string filename,std::string record_name,FILE *File, LimeWriter *LimeW) + void writeLimeLatticeBinaryObject(Lattice &field,std::string record_name) { - //////////////////////////////////////////// // Create record header //////////////////////////////////////////// @@ -287,7 +324,7 @@ class LimeIO : public BinaryIO { int err; uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites; - createLimeRecordHeader(record_name, 0, 0, PayloadSize, LimeW); + createLimeRecordHeader(record_name, 0, 0, PayloadSize); //////////////////////////////////////////////////////////////////// // NB: FILE and iostream are jointly writing disjoint sequences in the @@ -317,34 +354,25 @@ class LimeIO : public BinaryIO { checksum.suma= streama.str(); checksum.sumb= streamb.str(); std::cout << GridLogMessage<<" writing scidac checksums "< - int open(std::string filename,GridBase *grid,userFile &_userFile,int volfmt) { - - } - void close(void) { - - } - template - int writeScidacField(Lattice &field,userRecord &_userRecord,int volfmt) - template - int readScidacField(Lattice &field,userRecord &_userRecord,int volfmt) - */ + + template + void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) + { + scidacFile _scidacFile(grid); + writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); + writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); + } //////////////////////////////////////////////// // Write generic lattice field in scidac format //////////////////////////////////////////////// - template - static void 
writeScidacField(std::string filename,Lattice &field,userFile _userFile,userRecord _userRecord) + template + void writeScidacFieldRecord(Lattice &field,userRecord _userRecord) { typedef typename vobj::scalar_object sobj; uint64_t nbytes; @@ -362,34 +390,25 @@ class ScidacIO : public LimeIO { ////////////////////////////////////////////// // Fill the Lime file record by record ////////////////////////////////////////////// - FILE *File = fopen(filename.c_str(), "w"); - LimeWriter *LimeW = limeCreateWriter(File); - assert(LimeW != NULL ); - - writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message - writeLimeObject(0,0,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW); - writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW); - writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW); - writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW); - writeLimeLatticeBinaryObject(field,filename,std::string(ILDG_BINARY_DATA),File,LimeW); // Closes message with checksum - - limeDestroyWriter(LimeW); - fclose(File); + writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message + writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); + writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); + writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum } }; -class IldgIO : public ScidacIO { +class IldgWriter : public ScidacWriter { public: /////////////////////////////////// // A little helper /////////////////////////////////// - static void writeLimeIldgLFN(std::string &LFN,LimeWriter *LimeW) + void writeLimeIldgLFN(std::string &LFN) { uint64_t PayloadSize = LFN.size(); int err; - createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW); - err=limeWriteRecordData(const_cast(LFN.c_str()), &PayloadSize, LimeW); assert(err>=0); + createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize); + err=limeWriteRecordData(const_cast(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0); err=limeWriterCloseRecord(LimeW); assert(err>=0); } @@ -399,7 +418,7 @@ class IldgIO : public ScidacIO { // Use Grid MetaData object if present. 
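//
// Driver-side sketch, matching the updated ILDG checkpointer (filename,
// trajectory number, LFN and description illustrative):
//
//   IldgWriter _IldgWriter;
//   _IldgWriter.open(config);                                  // config : std::string
//   _IldgWriter.writeConfiguration(U, traj, LFN, description); // closes message with checksum
//   _IldgWriter.close();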
//////////////////////////////////////////////////////////////// template - static void writeConfiguration(std::string filename,Lattice > &Umu) + void writeConfiguration(Lattice > &Umu,int sequence,std::string LFN,std::string description) { GridBase * grid = Umu._grid; typedef Lattice > GaugeField; @@ -418,6 +437,10 @@ class IldgIO : public ScidacIO { ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); std::string format = header.floating_point; + header.ensemble_id = description; + header.ensemble_label = description; + header.sequence_number = sequence; + header.ildg_lfn = LFN; assert ( (format == std::string("IEEE32BIG")) ||(format == std::string("IEEE64BIG")) ); @@ -453,20 +476,21 @@ class IldgIO : public ScidacIO { ////////////////////////////////////////////// // Fill the Lime file record by record ////////////////////////////////////////////// - - FILE *File = fopen(filename.c_str(), "w"); - LimeWriter *LimeW = limeCreateWriter(File); assert(LimeW != NULL); - writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message - writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW); - writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW); - writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW); - writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW); - writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT),LimeW); // rec - writeLimeIldgLFN(header.ildg_lfn, LimeW); // rec - writeLimeLatticeBinaryObject(Umu,filename,std::string(ILDG_BINARY_DATA),File,LimeW); // Closes message with checksum - limeDestroyWriter(LimeW); + writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message + writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); + writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); + writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); + writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); + writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT)); // rec + writeLimeIldgLFN(header.ildg_lfn); // rec + writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum + // limeDestroyWriter(LimeW); fclose(File); } +}; + +class IldgReader : public GridLimeReader { + public: //////////////////////////////////////////////////////////////// // Read either Grid/SciDAC/ILDG configuration @@ -476,7 +500,7 @@ class IldgIO : public ScidacIO { // Else use SciDAC MetaData object if present. 
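//
// Driver-side sketch, mirroring the updated ILDG tests (filename illustrative):
//
//   IldgReader _IldgReader;
//   FieldMetaData header;
//   _IldgReader.open(file);
//   _IldgReader.readConfiguration(Umu, header); // metadata returned in header
//   _IldgReader.close();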
////////////////////////////////////////////////////////////////
  template 
-  static void readConfiguration(std::string filename,Lattice > &Umu, FieldMetaData &FieldMetaData_) {
+  void readConfiguration(Lattice > &Umu, FieldMetaData &FieldMetaData_) {

    typedef Lattice > GaugeField;
    typedef typename GaugeField::vector_object vobj;
@@ -491,9 +515,6 @@ class IldgIO : public ScidacIO {

    assert(dims.size()==4);

-    FILE *File = fopen(filename.c_str(), "r");
-    LimeReader *LimeR = limeCreateReader(File);
-
    // Metadata holders
    ildgFormat     ildgFormat_ ;
    std::string    ildgLFN_ ;
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
index b72fc6f7..118a8e25 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -77,7 +77,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer {
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      IldgIO::writeConfiguration(config,U, Params.format);
+      IldgWriter _IldgWriter;
+      _IldgWriter.open(config);
+      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.close();

      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
                << " checksum " << std::hex
@@ -97,7 +100,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer {
    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);

    FieldMetaData header;
-    IldgIO::readConfiguration(config,U,header);  // format from the header
+    IldgReader _IldgReader;
+    _IldgReader.open(config);
+    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.close();

    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
              << " checksum " << std::hex
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
index 199773ab..e3e9d385 100644
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -78,13 +78,19 @@ int main (int argc, char ** argv)
  std::cout <(Umu,mu);

From 8b7049f737617f67815433b52a7888874f7ffec1 Mon Sep 17 00:00:00 2001
From: paboyle 
Date: Mon, 19 Jun 2017 08:46:07 +0100
Subject: [PATCH 077/170] Improved detection of usqcdInfo for plaq/linktr

---
 lib/parallelIO/IldgIO.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index 1d1b5e0c..17ce4a06 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -598,9 +598,14 @@ class IldgReader : public GridLimeReader {
      }

      if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
-       XmlReader RD(&xmlc[0],"");
-       read(RD,"usqcdInfo",usqcdInfo_);
-       found_usqcdInfo = 1;
+       std::string xmls(&xmlc[0]);
+       // is it a USQCD info field
+       if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
+         std::cout << GridLogMessage<<"...found a usqcdInfo field"< Date: Mon, 19 Jun 2017 14:04:21 +0100
Subject: [PATCH 078/170] Block solver improvements

---
 .../iterative/BlockConjugateGradient.h       |  19 +-
 lib/lattice/Lattice_reduction.h              | 189 +++++++++++++++++-
 .../solver/Test_staggered_block_cg_unprec.cc |   7 +-
 3 files changed, 192 insertions(+), 23 deletions(-)

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index d90194ae..53e11fa7 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -42,7 +42,7 @@ class BlockConjugateGradient : public OperatorFunction {
 typedef 
typename Field::scalar_type scomplex; - const int blockDim = 0; + int blockDim ; int Nblock; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. @@ -51,14 +51,15 @@ class BlockConjugateGradient : public OperatorFunction { Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), + blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { - int Orthog = 0; // First dimension is block dim + int Orthog = blockDim; // First dimension is block dim; this is an assumption Nblock = Src._grid->_fdimensions[Orthog]; std::cout< &Linop, const Field &Src, Field &Psi) Linop.HermOp(Psi, AP); AP = AP-Src; - std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) < { typedef typename Field::scalar_type scomplex; - const int blockDim = 0; - + int blockDim; int Nblock; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. // Defaults true. @@ -218,14 +218,15 @@ class MultiRHSConjugateGradient : public OperatorFunction { Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), + blockDim(Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { - int Orthog = 0; // First dimension is block dim + int Orthog = blockDim; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; std::cout< &Linop, const Field &Src, Field &Psi) MatrixTimer.Stop(); // Alpha - // sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); sliceInnerTimer.Start(); sliceInnerProductVector(v_pAp,P,AP,Orthog); sliceInnerTimer.Stop(); for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice typedef typename vobj::vector_type vector_type; int Nblock = X._grid->GlobalDimensions()[Orthog]; - + GridBase *FullGrid = X._grid; GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - + Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); - + +#if 0 + // R[i] = Y[i] + X[j] a(j,i) for(int i=0;i &R,Eigen::MatrixXcd &aa,const Lattice } InsertSlice(Rslice,R,i,Orthog); } +#endif +#if 0 + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + +#pragma omp parallel +{ + + std::vector lcoor(nl); // sliced coor + std::vector hcoor(nh); // unsliced coor + std::vector s_x(Nblock); + +#pragma omp for + for(int idx=0;idxlSites();idx++){ + + SliceGrid->LocalIndexToLocalCoor(idx,lcoor); + + int ddl=0; + for(int d=0;d_simd_layout[Orthog]==1); + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + + + //FIXME package in a convenient iterator + //Should loop over a plane orthogonal to direction "Orthog" + int stride=FullGrid->_slice_stride[Orthog]; + int block =FullGrid->_slice_block [Orthog]; + int nblock=FullGrid->_slice_nblock[Orthog]; + int ostride=FullGrid->_ostride[Orthog]; +#pragma omp parallel + { + + std::vector s_x(Nblock); + +#pragma omp for collapse(2) + for(int n=0;n static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice 
&rhs,int Orthog) { @@ -497,7 +581,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice Lattice Rslice(SliceGrid); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - + +#if 0 for(int i=0;i mat(i,j) = innerProduct(Lslice,Rslice); } } -#undef FORCE_DIAG -#ifdef FORCE_DIAG - for(int i=0;i_ndimension; + int nl = SliceGrid->_ndimension; + +#pragma omp parallel +{ + std::vector lcoor(nl); // sliced coor + std::vector hcoor(nh); // unsliced coor + std::vector Left(Nblock); + std::vector Right(Nblock); + Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); + +#pragma omp for + for(int idx=0;idxlSites();idx++){ + + SliceGrid->LocalIndexToLocalCoor(idx,lcoor); + + int ddl=0; + for(int d=0;d ip = innerProduct(Left[i],Right[j]); + mat_thread(i,j) += ip; + }} + } + +#pragma omp critical + { + mat += mat_thread; + } + +} +#endif + +#if 1 + assert( FullGrid->_simd_layout[Orthog]==1); + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + + //FIXME package in a convenient iterator + //Should loop over a plane orthogonal to direction "Orthog" + int stride=FullGrid->_slice_stride[Orthog]; + int block =FullGrid->_slice_block [Orthog]; + int nblock=FullGrid->_slice_nblock[Orthog]; + int ostride=FullGrid->_ostride[Orthog]; + + typedef typename vobj::vector_typeD vector_typeD; + +#pragma omp parallel + { + std::vector Left(Nblock); + std::vector Right(Nblock); + Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); + +#pragma omp for collapse(2) + for(int n=0;n HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); - BlockConjugateGradient BCG(1.0e-8,10000); - MultiRHSConjugateGradient mCG(1.0e-8,10000); + int blockDim = 0; + BlockConjugateGradient BCG(blockDim,1.0e-8,10000); + MultiRHSConjugateGradient mCG(blockDim,1.0e-8,10000); std::cout << GridLogMessage << "************************************************************************ "< Date: Mon, 19 Jun 2017 22:03:03 +0100 Subject: [PATCH 079/170] No compile make tests fix --- lib/simd/Grid_vector_types.h | 4 ++-- lib/tensors/Tensor_class.h | 9 ++++++++- lib/tensors/Tensor_exp.h | 7 +++++-- tests/core/Test_GaugeAction.cc | 2 +- tests/core/Test_RectPlaq.cc | 2 +- tests/core/Test_main.cc | 2 +- 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 1ebe7379..e05fecc4 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -751,8 +751,8 @@ inline Grid_simd, V> toComplex(const Grid_simd &in) { conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == - conv.s[i]); // trap any cases where real was not duplicated + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match conv.s[i + 1] = 0.0; // zero imaginary parts diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index cb90da6c..c7f868db 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -156,11 +156,18 @@ class iScalar { // convert from a something to a scalar via constructor of something arg template ::value, T>::type * = nullptr> - strong_inline iScalar operator=(T arg) { + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } + // Convert elements + template + strong_inline iScalar operator=(iScalar &&arg) { + _internal = arg._internal; + return *this; + } + friend std::ostream &operator<<(std::ostream &stream,const iScalar &o) { stream << "S {" << 
o._internal << "}"; return stream; diff --git a/lib/tensors/Tensor_exp.h b/lib/tensors/Tensor_exp.h index e18fed70..f7eee8f0 100644 --- a/lib/tensors/Tensor_exp.h +++ b/lib/tensors/Tensor_exp.h @@ -80,8 +80,11 @@ template inline iVector Exponentiate(const iVector mat iQ2 = arg*arg*alpha*alpha; mat iQ3 = arg*iQ2*alpha; // sign in c0 from the conventions on the Ta - c0 = -imag( trace(iQ3) ) * one_over_three; - c1 = -real( trace(iQ2) ) * one_over_two; + scalar imQ3, reQ2; + imQ3 = imag( trace(iQ3) ); + reQ2 = real( trace(iQ2) ); + c0 = -imQ3 * one_over_three; + c1 = -reQ2 * one_over_two; // Cayley Hamilton checks to machine precision, tested tmp = c1 * one_over_three; diff --git a/tests/core/Test_GaugeAction.cc b/tests/core/Test_GaugeAction.cc index 2f0535f1..572f19fb 100644 --- a/tests/core/Test_GaugeAction.cc +++ b/tests/core/Test_GaugeAction.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - NerscField header; + FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); diff --git a/tests/core/Test_RectPlaq.cc b/tests/core/Test_RectPlaq.cc index 9154f879..2e9cc832 100644 --- a/tests/core/Test_RectPlaq.cc +++ b/tests/core/Test_RectPlaq.cc @@ -90,7 +90,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - NerscField header; + FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index 921298c7..378f49bd 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -336,7 +336,7 @@ int main(int argc, char **argv) { std::cout << GridLogMessage << "norm cMmat : " << norm2(cMat) << std::endl; - cMat = expMat(cMat, ComplexD(1.0, 0.0)); + cMat = expMat(cMat,1.0);// ComplexD(1.0, 0.0)); std::cout << GridLogMessage << "norm expMat: " << norm2(cMat) << std::endl; peekSite(cm, cMat, mysite); From 0a8faac2713c981be4a61c06d90ce0d6c5de211a Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 19 Jun 2017 22:54:18 +0100 Subject: [PATCH 080/170] Fix make tests compile --- lib/qcd/action/scalar/ScalarImpl.h | 13 ++++++++----- tests/debug/Test_cayley_ldop_cr.cc | 2 +- tests/solver/Test_dwf_hdcr.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index ee2d2fb8..0116b4f9 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -15,6 +15,8 @@ namespace Grid { typedef iImplField SiteField; + template using iImplScalar= iScalar > >; + typedef iImplScalar ComplexField; typedef Lattice Field; @@ -51,13 +53,14 @@ namespace Grid { public: typedef S Simd; - template - using iImplField = iScalar > >; - + template using iImplField = iScalar > >; + typedef iImplField SiteField; - - typedef Lattice Field; + + template using iImplScalar= iScalar > >; + typedef iImplScalar ComplexField; + static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ gaussian(pRNG, P); diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index dfda43d2..cbefdd46 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -67,7 +67,7 @@ int main (int argc, char ** argv) LatticeFermion err(FGrid); LatticeGaugeField Umu(UGrid); - NerscField header; + FieldMetaData header; std::string file("./ckpoint_lat.400"); NerscIO::readConfiguration(Umu,header,file); diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 
64ca0b33..c553ba0a 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -516,7 +516,7 @@ int main (int argc, char ** argv) LatticeColourMatrix U(UGrid); LatticeColourMatrix zz(UGrid); - NerscField header; + FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); From e9cc21900f00b81a17ab87d649e014edc99c636b Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 20 Jun 2017 12:37:41 +0100 Subject: [PATCH 081/170] Block solver complete for staggered. Now stable on mass 0.003 and gives 8x (!) speed up on Haswell laptop vs. standard CG for 8 RHS solves. 166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain. Better than a slap in the face with a wet kipper. --- .../iterative/BlockConjugateGradient.h | 295 ++++++++++++++++-- lib/lattice/Lattice_reduction.h | 235 +++----------- .../solver/Test_staggered_block_cg_unprec.cc | 13 +- 3 files changed, 321 insertions(+), 222 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 53e11fa7..f8b83b1f 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -33,6 +33,8 @@ directory namespace Grid { +enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS }; + ////////////////////////////////////////////////////////////////////////// // Block conjugate gradient. Dimension zero should be the block direction ////////////////////////////////////////////////////////////////////////// @@ -40,24 +42,274 @@ template class BlockConjugateGradient : public OperatorFunction { public: + typedef typename Field::scalar_type scomplex; int blockDim ; - int Nblock; + + BlockCGtype CGtype; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. // Defaults true. RealD Tolerance; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) + BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), + CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Thin QR factorisation (google it) +//////////////////////////////////////////////////////////////////////////////////////////////////// +void ThinQRfact (Eigen::MatrixXcd &m_rr, + Eigen::MatrixXcd &C, + Eigen::MatrixXcd &Cinv, + Field & Q, + const Field & R) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + //////////////////////////////////////////////////////////////////////////////////////////////////// + //Dimensions + // R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock + // + // Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen) + // + // Q C = R => Q = R C^{-1} + // + // Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} + // + // Set C = L^{dag}, and then Q^dag Q = ident + // + // Checks: + // Cdag C = Rdag R ; passes. 
+ // QdagQ = 1 ; passes + //////////////////////////////////////////////////////////////////////////////////////////////////// + sliceInnerProductMatrix(m_rr,R,R,Orthog); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Cholesky from Eigen + // There exists a ldlt that is documented as more stable + //////////////////////////////////////////////////////////////////////////////////////////////////// + Eigen::MatrixXcd L = m_rr.llt().matrixL(); + + C = L.adjoint(); + Cinv = C.inverse(); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Q = R C^{-1} + // + // Q_j = R_i Cinv(i,j) + // + // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already + //////////////////////////////////////////////////////////////////////////////////////////////////// + // FIXME:: make a sliceMulMatrix to avoid zero vector + sliceMulMatrix(Q,Cinv,R,Orthog); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Call one of several implementations +//////////////////////////////////////////////////////////////////////////////////////////////////// void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + if ( CGtype == BlockCGrQ ) { + BlockCGrQsolve(Linop,Src,Psi); + } else if (CGtype == BlockCG ) { + BlockCGsolve(Linop,Src,Psi); + } else if (CGtype == CGmultiRHS ) { + CGmultiRHSsolve(Linop,Src,Psi); + } else { + assert(0); + } +} + +//////////////////////////////////////////////////////////////////////////// +// BlockCGrQ implementation: +//-------------------------- +// X is guess/Solution +// B is RHS +// Solve A X_i = B_i ; i refers to Nblock index +//////////////////////////////////////////////////////////////////////////// +void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + Nblock = B._grid->_fdimensions[Orthog]; + + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,B,Orthog); + RealD sssum=0; + for(int b=0;b Thin QR factorisation (google it) + * for k: + * Z = AD + * M = [D^dag Z]^{-1} + * X = X + D MC + * QS = Q - ZM + * D = Q + D S^dag + * C = S C + */ + /////////////////////////////////////// + // Initial block: initial search dir is guess + /////////////////////////////////////// + std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " < Thin QR factorisation (google it) + + Linop.HermOp(X, AD); + tmp = B - AD; + ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); + D=Q; + + std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " < max_resid ) max_resid = rr; + } + + std::cout << GridLogIterative << "\titeration "< &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim; this is an assumption Nblock = Src._grid->_fdimensions[Orthog]; @@ -163,8 +415,9 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) ********************* */ RealD max_resid=0; + RealD rr; for(int b=0;b max_resid ) max_resid = rr; } @@ -174,13 +427,14 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"BlockCG converged in "< &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } -}; - - ////////////////////////////////////////////////////////////////////////// // multiRHS conjugate gradient. 
Dimension zero should be the block direction +// Use this for spread out across nodes ////////////////////////////////////////////////////////////////////////// -template -class MultiRHSConjugateGradient : public OperatorFunction { - public: - - typedef typename Field::scalar_type scomplex; - - int blockDim; - int Nblock; - bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. - // Defaults true. - RealD Tolerance; - Integer MaxIterations; - Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - - MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) - : Tolerance(tol), - blockDim(Orthog), - MaxIterations(maxit), - ErrorOnNoConverge(err_on_no_conv){}; - -void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +void CGmultiRHSsolve(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; @@ -331,7 +563,7 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"MultiRHS solver converged in " < &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } + }; - - } #endif diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 78f88ce3..c5b20f3c 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice } }; - -/* -template -static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_rdimensions[Orthog]. 
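Stepping back to the Cholesky-based thin QR that BlockCGrQ relies on above: the linear algebra can be checked in isolation with plain Eigen. A minimal standalone sketch, with Nferm and Nblock as made-up stand-ins for the fermion volume and block count, and R assumed to have full column rank so the LLT factorisation of R^dag R exists:

#include <iostream>
#include <Eigen/Dense>

int main() {
  const int Nblock = 4, Nferm = 64;          // illustrative sizes only
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(Nferm, Nblock);

  // m_rr = R^dag R is Hermitian positive definite, so m_rr = L L^dag (Cholesky)
  Eigen::MatrixXcd m_rr = R.adjoint() * R;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  // Set C = L^dag ; then Q = R C^{-1} has orthonormal columns:
  // Q^dag Q = C^{-dag} (L L^dag) C^{-1} = L^{-1} L L^dag L^{-dag} = 1
  Eigen::MatrixXcd C    = L.adjoint();
  Eigen::MatrixXcd Q    = R * C.inverse();

  std::cout << "|Q^dag Q - 1| = "
            << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(Nblock,Nblock)).norm()
            << std::endl;                    // expect ~1e-14
  std::cout << "|Q C - R|     = " << (Q*C - R).norm() << std::endl;
  return 0;
}

Viewed this way, ThinQRfact needs only two lattice primitives: sliceInnerProductMatrix to form m_rr, and sliceMulMatrix to apply C^{-1} slice by slice — which is why those are the routines being rewritten with the strided plane loops in this hunk.
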
-////////////////////////////////////////////////////////////////////////////////////////// - inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) { int NN = BlockSolverGrid->_ndimension; @@ -453,7 +388,6 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } - template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { @@ -469,64 +403,10 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); -#if 0 - // R[i] = Y[i] + X[j] a(j,i) - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector s_x(Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; - //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; @@ -535,7 +415,6 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int ostride=FullGrid->_ostride[Orthog]; #pragma omp parallel { - std::vector s_x(Nblock); #pragma omp for collapse(2) @@ -543,13 +422,11 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice } }} } -#endif +}; + +template +static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + assert( FullGrid->_simd_layout[Orthog]==1); + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + + //FIXME package in a convenient iterator + //Should loop over a plane orthogonal to direction "Orthog" + int stride=FullGrid->_slice_stride[Orthog]; + int block =FullGrid->_slice_block [Orthog]; + int nblock=FullGrid->_slice_nblock[Orthog]; + int ostride=FullGrid->_ostride[Orthog]; +#pragma omp parallel + { + std::vector s_x(Nblock); + +#pragma omp for collapse(2) + for(int n=0;n static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -582,63 +507,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); -#if 0 - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector Left(Nblock); - std::vector Right(Nblock); - Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d ip = innerProduct(Left[i],Right[j]); - mat_thread(i,j) += ip; - }} - } - -#pragma omp critical - { - mat += mat_thread; - } - -} -#endif - -#if 1 assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; @@ -681,7 +549,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat += mat_thread; } } -#endif return; } diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 8da93195..8db41e98 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=4; + const int Ls=8; Grid_init(&argc,&argv); @@ -80,12 +80,13 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); int blockDim = 0; - BlockConjugateGradient BCG(blockDim,1.0e-8,10000); - MultiRHSConjugateGradient mCG(blockDim,1.0e-8,10000); + BlockConjugateGradient BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000); + BlockConjugateGradient BCG (BlockCG,blockDim,1.0e-8,10000); + BlockConjugateGradient mCG (CGmultiRHS,blockDim,1.0e-8,10000); - std::cout << GridLogMessage << "************************************************************************ "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); @@ -112,7 +113,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Calling Block CG for "< Date: Tue, 20 Jun 2017 17:24:55 +0100 Subject: [PATCH 082/170] various compatibility fixes after merge --- lib/qcd/action/gauge/Photon.h | 6 ++++-- lib/qcd/action/scalar/ScalarImpl.h | 17 +++++++++++++---- tests/IO/Test_ildg_io.cc | 2 ++ tests/IO/Test_ildg_read.cc | 2 ++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h index 1512d4e3..7e21a1de 100644 --- a/lib/qcd/action/gauge/Photon.h +++ b/lib/qcd/action/gauge/Photon.h @@ -41,11 +41,13 @@ namespace QCD{ template using iImplGaugeField = iVector>, Nd>; - typedef iImplGaugeLink SiteLink; + typedef iImplGaugeLink SiteLink; typedef iImplGaugeField SiteField; + typedef SiteField SiteComplex; - typedef Lattice LinkField; + typedef Lattice LinkField; typedef Lattice Field; + typedef Field ComplexField; }; typedef QedGimpl QedGimplR; diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 868bfc84..5342a1fa 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -15,8 +15,10 @@ class ScalarImplTypes { typedef iImplField SiteField; typedef SiteField SitePropagator; + typedef SiteField SiteComplex; typedef Lattice Field; + typedef Field 
ComplexField; typedef Field FermionField; typedef Field PropagatorField; @@ -92,11 +94,18 @@ class ScalarImplTypes { public: typedef S Simd; template - using iImplField = iScalar > >; + using iImplField = iScalar>>; + template + using iImplComplex = iScalar>>; - typedef iImplField SiteField; - - typedef Lattice Field; + typedef iImplField SiteField; + typedef SiteField SitePropagator; + typedef iImplComplex SiteComplex; + + typedef Lattice Field; + typedef Lattice ComplexField; + typedef Field FermionField; + typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) { QCD::SU::GaussianFundamentalLieAlgebraMatrix(pRNG, P); diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc index e3e9d385..6aac2e38 100644 --- a/tests/IO/Test_ildg_io.cc +++ b/tests/IO/Test_ildg_io.cc @@ -36,6 +36,7 @@ using namespace Grid::QCD; int main (int argc, char ** argv) { +#ifdef HAVE_LIME Grid_init(&argc,&argv); std::cout < Date: Tue, 20 Jun 2017 18:46:01 +0100 Subject: [PATCH 083/170] Improved the lancos --- TODO | 28 +- lib/algorithms/densematrix/DenseMatrix.h | 137 --- lib/algorithms/densematrix/Francis.h | 525 ---------- lib/algorithms/densematrix/Householder.h | 242 ----- .../iterative/ImplicitlyRestartedLanczos.h | 987 ++++-------------- lib/qcd/hmc/checkpointers/ILDGCheckpointer.h | 2 +- tests/solver/Test_dwf_lanczos.cc | 2 +- 7 files changed, 211 insertions(+), 1712 deletions(-) delete mode 100644 lib/algorithms/densematrix/DenseMatrix.h delete mode 100644 lib/algorithms/densematrix/Francis.h delete mode 100644 lib/algorithms/densematrix/Householder.h diff --git a/TODO b/TODO index a5d4cabd..eeb7dfa5 100644 --- a/TODO +++ b/TODO @@ -1,24 +1,28 @@ TODO: --------------- -Peter's work list: -1)- Precision conversion and sort out localConvert <-- -2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- - --- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet --- Physical propagator interface --- Conserved currents --- GaugeFix into central location --- Multigrid Wilson and DWF, compare to other Multigrid implementations --- HDCR resume +Large item work list: +1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. 
<-- +2)- MultiRHS with spread out extra dim +3)- BG/Q port and check +4)- Precision conversion and sort out localConvert <-- partial + - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet +5)- Physical propagator interface +6)- Conserved currents +7)- Multigrid Wilson and DWF, compare to other Multigrid implementations +8)- HDCR resume Recent DONE +-- GaugeFix into central location <-- DONE +-- Scidac and Ildg metadata handling <-- DONE +-- Binary I/O MPI2 IO <-- DONE -- Binary I/O speed up & x-strips <-- DONE -- Cut down the exterior overhead <-- DONE -- Interior legs from SHM comms <-- DONE -- Half-precision comms <-- DONE --- Merge high precision reduction into develop --- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- Merge high precision reduction into develop <-- DONE +-- BlockCG, BCGrQ <-- DONE +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE -- slice* linalg routines for multiRHS, BlockCG ----- diff --git a/lib/algorithms/densematrix/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h deleted file mode 100644 index d86add21..00000000 --- a/lib/algorithms/densematrix/DenseMatrix.h +++ /dev/null @@ -1,137 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/algorithms/iterative/DenseMatrix.h - - Copyright (C) 2015 - -Author: Peter Boyle -Author: paboyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_DENSE_MATRIX_H -#define GRID_DENSE_MATRIX_H - -namespace Grid { - ///////////////////////////////////////////////////////////// - // Matrix untils - ///////////////////////////////////////////////////////////// - -template using DenseVector = std::vector; -template using DenseMatrix = DenseVector >; - -template void Size(DenseVector & vec, int &N) -{ - N= vec.size(); -} -template void Size(DenseMatrix & mat, int &N,int &M) -{ - N= mat.size(); - M= mat[0].size(); -} - -template void SizeSquare(DenseMatrix & mat, int &N) -{ - int M; Size(mat,N,M); - assert(N==M); -} - -template void Resize(DenseVector & mat, int N) { - mat.resize(N); -} -template void Resize(DenseMatrix & mat, int N, int M) { - mat.resize(N); - for(int i=0;i void Fill(DenseMatrix & mat, T&val) { - int N,M; - Size(mat,N,M); - for(int i=0;i DenseMatrix Transpose(DenseMatrix & mat){ - int N,M; - Size(mat,N,M); - DenseMatrix C; Resize(C,M,N); - for(int i=0;i void Unity(DenseMatrix &A){ - int N; SizeSquare(A,N); - for(int i=0;i -void PlusUnit(DenseMatrix & A,T c){ - int dim; SizeSquare(A,dim); - for(int i=0;i -DenseMatrix HermitianConj(DenseMatrix &mat){ - - int dim; SizeSquare(mat,dim); - - DenseMatrix C; Resize(C,dim,dim); - - for(int i=0;i -DenseMatrix GetSubMtx(DenseMatrix &A,int row_st, int row_end, int col_st, int col_end) -{ - DenseMatrix H; Resize(H,row_end - row_st,col_end-col_st); - - for(int i = row_st; i - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef FRANCIS_H -#define FRANCIS_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//#include -//#include -//#include - -namespace Grid { - -template int SymmEigensystem(DenseMatrix &Ain, DenseVector &evals, DenseMatrix &evecs, RealD small); -template int Eigensystem(DenseMatrix &Ain, DenseVector &evals, DenseMatrix &evecs, RealD small); - -/** - Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm. -H = - x x x x x x x x x - x x x x x x x x x - 0 x x x x x x x x - 0 0 x x x x x x x - 0 0 0 x x x x x x - 0 0 0 0 x x x x x - 0 0 0 0 0 x x x x - 0 0 0 0 0 0 x x x - 0 0 0 0 0 0 0 x x -Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary. 
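This hand-rolled Francis QR is precisely what TODO item 1 above earmarks for replacement by Eigen. For the symmetric tridiagonal problem the Lanczos iteration actually produces (lmd on the diagonal, lme on the sub-diagonal), Eigen already ships a dedicated solver; a minimal sketch with made-up test values follows. Note that Eigen, like the LAPACK path, returns the eigenvalues in increasing order, whereas the in-house qr gives them in decreasing order:

#include <iostream>
#include <Eigen/Eigenvalues>

int main() {
  const int Nm = 6;                          // illustrative size
  Eigen::VectorXd lmd(Nm), lme(Nm - 1);      // Lanczos alpha / beta coefficients
  lmd << 4, 3, 2, 5, 1, 6;                   // made-up test values
  lme << 0.5, 0.5, 0.5, 0.5, 0.5;

  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es;
  es.computeFromTridiagonal(lmd, lme);       // implicit-shift QL/QR internally

  std::cout << "evals (ascending): " << es.eigenvalues().transpose() << std::endl;
  // es.eigenvectors().col(k) holds the tridiagonal-basis eigenvector for eval k
  return 0;
}
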
-**/ -template -int QReigensystem(DenseMatrix &Hin, DenseVector &evals, DenseMatrix &evecs, RealD small) -{ - DenseMatrix H = Hin; - - int N ; SizeSquare(H,N); - int M = N; - - Fill(evals,0); - Fill(evecs,0); - - T s,t,x=0,y=0,z=0; - T u,d; - T apd,amd,bc; - DenseVector p(N,0); - T nrm = Norm(H); ///DenseMatrix Norm - int n, m; - int e = 0; - int it = 0; - int tot_it = 0; - int l = 0; - int r = 0; - DenseMatrix P; Resize(P,N,N); Unity(P); - DenseVector trows(N,0); - - /// Check if the matrix is really hessenberg, if not abort - RealD sth = 0; - for(int j=0;j small){ - std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl; - exit(1); - } - } - } - - do{ - std::cout << "Francis QR Step N = " << N << std::endl; - /** Check for convergence - x x x x x - 0 x x x x - 0 0 x x x - 0 0 x x x - 0 0 0 0 x - for this matrix l = 4 - **/ - do{ - l = Chop_subdiag(H,nrm,e,small); - r = 0; ///May have converged on more than one eval - ///Single eval - if(l == N-1){ - evals[e] = H[l][l]; - N--; e++; r++; it = 0; - } - ///RealD eval - if(l == N-2){ - trows[l+1] = 1; ///Needed for UTSolve - apd = H[l][l] + H[l+1][l+1]; - amd = H[l][l] - H[l+1][l+1]; - bc = (T)4.0*H[l+1][l]*H[l][l+1]; - evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) ); - evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) ); - N-=2; e+=2; r++; it = 0; - } - } while(r>0); - - if(N ==0) break; - - DenseVector ck; Resize(ck,3); - DenseVector v; Resize(v,3); - - for(int m = N-3; m >= l; m--){ - ///Starting vector essentially random shift. - if(it%10 == 0 && N >= 3 && it > 0){ - s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) ); - t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) ); - x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t; - y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s); - z = H[m+1][m]*H[m+2][m+1]; - } - ///Starting vector implicit Q theorem - else{ - s = (H[N-2][N-2] + H[N-1][N-1]); - t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]); - x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t; - y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s); - z = H[m+1][m]*H[m+2][m+1]; - } - ck[0] = x; ck[1] = y; ck[2] = z; - - if(m == l) break; - - /** Some stupid thing from numerical recipies, seems to work**/ - // PAB.. for heaven's sake quote page, purpose, evidence it works. - // what sort of comment is that!?!?!? - u=abs(H[m][m-1])*(abs(y)+abs(z)); - d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1])); - if ((T)abs(u+d) == (T)abs(d) ){ - l = m; break; - } - - //if (u < small){l = m; break;} - } - if(it > 100000){ - std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl; - std::cout << "got " << e << " evals " << l << " " << N << std::endl; - exit(1); - } - normalize(ck); ///Normalization cancels in PHP anyway - T beta; - Householder_vector(ck, 0, 2, v, beta); - Householder_mult(H,v,beta,0,l,l+2,0); - Householder_mult(H,v,beta,0,l,l+2,1); - ///Accumulate eigenvector - Householder_mult(P,v,beta,0,l,l+2,1); - int sw = 0; ///Are we on the last row? 
- for(int k=l;k(ck, 0, 2-sw, v, beta); - Householder_mult(H,v, beta,0,k+1,k+3-sw,0); - Householder_mult(H,v, beta,0,k+1,k+3-sw,1); - ///Accumulate eigenvector - Householder_mult(P,v, beta,0,k+1,k+3-sw,1); - } - it++; - tot_it++; - }while(N > 1); - N = evals.size(); - ///Annoying - UT solves in reverse order; - DenseVector tmp; Resize(tmp,N); - for(int i=0;i -int my_Wilkinson(DenseMatrix &Hin, DenseVector &evals, DenseMatrix &evecs, RealD small) -{ - /** - Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm. - H = - x x 0 0 0 0 - x x x 0 0 0 - 0 x x x 0 0 - 0 0 x x x 0 - 0 0 0 x x x - 0 0 0 0 x x - Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary. **/ - return my_Wilkinson(Hin, evals, evecs, small, small); -} - -template -int my_Wilkinson(DenseMatrix &Hin, DenseVector &evals, DenseMatrix &evecs, RealD small, RealD tol) -{ - int N; SizeSquare(Hin,N); - int M = N; - - ///I don't want to modify the input but matricies must be passed by reference - //Scale a matrix by its "norm" - //RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm); - DenseMatrix H; H = Hin; - - RealD Hnorm = abs(Norm(Hin)); - H = H * (1.0 / Hnorm); - - // TODO use openmp and memset - Fill(evals,0); - Fill(evecs,0); - - T s, t, x = 0, y = 0, z = 0; - T u, d; - T apd, amd, bc; - DenseVector p; Resize(p,N); Fill(p,0); - - T nrm = Norm(H); ///DenseMatrix Norm - int n, m; - int e = 0; - int it = 0; - int tot_it = 0; - int l = 0; - int r = 0; - DenseMatrix P; Resize(P,N,N); - Unity(P); - DenseVector trows(N, 0); - /// Check if the matrix is really symm tridiag - RealD sth = 0; - for(int j = 0; j < N; ++j) - { - for(int i = j + 2; i < N; ++i) - { - if(abs(H[i][j]) > tol || abs(H[j][i]) > tol) - { - std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl; - std::cout << "Warning tridiagonalize and call again" << std::endl; - // exit(1); // see what is going on - //return; - } - } - } - - do{ - do{ - //Jasper - //Check if the subdiagonal term is small enough ( 0); - //Jasper - //Already converged - //-------------- - if(N == 0) break; - - DenseVector ck,v; Resize(ck,2); Resize(v,2); - - for(int m = N - 3; m >= l; m--) - { - ///Starting vector essentially random shift. - if(it%10 == 0 && N >= 3 && it > 0) - { - t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]); - x = H[m][m] - t; - z = H[m + 1][m]; - } else { - ///Starting vector implicit Q theorem - d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5; - t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] - / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2])); - x = H[m][m] - t; - z = H[m + 1][m]; - } - //Jasper - //why it is here???? 
- //----------------------- - if(m == l) - break; - - u = abs(H[m][m - 1]) * (abs(y) + abs(z)); - d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1])); - if ((T)abs(u + d) == (T)abs(d)) - { - l = m; - break; - } - } - //Jasper - if(it > 1000000) - { - std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl; - std::cout << "got " << e << " evals " << l << " " << N << std::endl; - exit(1); - } - // - T s, c; - Givens_calc(x, z, c, s); - Givens_mult(H, l, l + 1, c, -s, 0); - Givens_mult(H, l, l + 1, c, s, 1); - Givens_mult(P, l, l + 1, c, s, 1); - // - for(int k = l; k < N - 2; ++k) - { - x = H.A[k + 1][k]; - z = H.A[k + 2][k]; - Givens_calc(x, z, c, s); - Givens_mult(H, k + 1, k + 2, c, -s, 0); - Givens_mult(H, k + 1, k + 2, c, s, 1); - Givens_mult(P, k + 1, k + 2, c, s, 1); - } - it++; - tot_it++; - }while(N > 1); - - N = evals.size(); - ///Annoying - UT solves in reverse order; - DenseVector tmp(N); - for(int i = 0; i < N; ++i) - tmp[i] = evals[N-i-1]; - evals = tmp; - // - UTeigenvectors(H, trows, evals, evecs); - //UTSymmEigenvectors(H, trows, evals, evecs); - for(int i = 0; i < evals.size(); ++i) - { - evecs[i] = P * evecs[i]; - normalize(evecs[i]); - evals[i] = evals[i] * Hnorm; - } - // // FIXME this is to test - // Hin.write("evecs3", evecs); - // Hin.write("evals3", evals); - // // check rsd - // for(int i = 0; i < M; i++) { - // vector Aevec = Hin * evecs[i]; - // RealD norm2(0.); - // for(int j = 0; j < M; j++) { - // norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]); - // } - // } - return tot_it; -} - -template -void Hess(DenseMatrix &A, DenseMatrix &Q, int start){ - - /** - turn a matrix A = - x x x x x - x x x x x - x x x x x - x x x x x - x x x x x - into - x x x x x - x x x x x - 0 x x x x - 0 0 x x x - 0 0 0 x x - with householder rotations - Slow. 
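The same Eigen-replacement argument applies to this Householder reduction: Eigen provides it out of the box as HessenbergDecomposition. A minimal standalone sketch, where A is a made-up dense complex matrix and the factorisation satisfies A = Q H Q^dag with H upper Hessenberg and Q unitary:

#include <iostream>
#include <Eigen/Dense>

int main() {
  const int N = 6;                           // illustrative size
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(N, N);

  Eigen::HessenbergDecomposition<Eigen::MatrixXcd> hd(A);
  Eigen::MatrixXcd H = hd.matrixH();         // upper Hessenberg part
  Eigen::MatrixXcd Q = hd.matrixQ();         // accumulated Householder reflections

  // Reconstruction error should sit at machine precision
  std::cout << "|A - Q H Q^dag| = " << (A - Q*H*Q.adjoint()).norm() << std::endl;
  return 0;
}
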
- */ - int N ; SizeSquare(A,N); - DenseVector p; Resize(p,N); Fill(p,0); - - for(int k=start;k ck,v; Resize(ck,N-k-1); Resize(v,N-k-1); - for(int i=k+1;i(ck, 0, ck.size()-1, v, beta); ///Householder vector - Householder_mult(A,v,beta,start,k+1,N-1,0); ///A -> PA - Householder_mult(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H - ///Accumulate eigenvector - Householder_mult(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H - } - /*for(int l=0;l -void Tri(DenseMatrix &A, DenseMatrix &Q, int start){ -///Tridiagonalize a matrix - int N; SizeSquare(A,N); - Hess(A,Q,start); - /*for(int l=0;l -void ForceTridiagonal(DenseMatrix &A){ -///Tridiagonalize a matrix - int N ; SizeSquare(A,N); - for(int l=0;l -int my_SymmEigensystem(DenseMatrix &Ain, DenseVector &evals, DenseVector > &evecs, RealD small){ - ///Solve a symmetric eigensystem, not necessarily in tridiagonal form - int N; SizeSquare(Ain,N); - DenseMatrix A; A = Ain; - DenseMatrix Q; Resize(Q,N,N); Unity(Q); - Tri(A,Q,0); - int it = my_Wilkinson(A, evals, evecs, small); - for(int k=0;k -int Wilkinson(DenseMatrix &Ain, DenseVector &evals, DenseVector > &evecs, RealD small){ - return my_Wilkinson(Ain, evals, evecs, small); -} - -template -int SymmEigensystem(DenseMatrix &Ain, DenseVector &evals, DenseVector > &evecs, RealD small){ - return my_SymmEigensystem(Ain, evals, evecs, small); -} - -template -int Eigensystem(DenseMatrix &Ain, DenseVector &evals, DenseVector > &evecs, RealD small){ -///Solve a general eigensystem, not necessarily in tridiagonal form - int N = Ain.dim; - DenseMatrix A(N); A = Ain; - DenseMatrix Q(N);Q.Unity(); - Hess(A,Q,0); - int it = QReigensystem(A, evals, evecs, small); - for(int k=0;k - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef HOUSEHOLDER_H -#define HOUSEHOLDER_H - -#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl; -#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl; -#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl; - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Grid { -/** Comparison function for finding the max element in a vector **/ -template bool cf(T i, T j) { - return abs(i) < abs(j); -} - -/** - Calculate a real Givens angle - **/ -template inline void Givens_calc(T y, T z, T &c, T &s){ - - RealD mz = (RealD)abs(z); - - if(mz==0.0){ - c = 1; s = 0; - } - if(mz >= (RealD)abs(y)){ - T t = -y/z; - s = (T)1.0 / sqrt ((T)1.0 + t * t); - c = s * t; - } else { - T t = -z/y; - c = (T)1.0 / sqrt ((T)1.0 + t * t); - s = c * t; - } -} - -template inline void Givens_mult(DenseMatrix &A, int i, int k, T c, T s, int dir) -{ - int q ; SizeSquare(A,q); - - if(dir == 0){ - for(int j=0;j inline void Householder_vector(DenseVector input, int k, int j, DenseVector &v, T &beta) -{ - int N ; Size(input,N); - T m = *max_element(input.begin() + k, input.begin() + j + 1, cf ); - - if(abs(m) > 0.0){ - T alpha = 0; - - for(int i=k; i 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha; - else v[k] = -alpha; - } else{ - for(int i=k; i inline void Householder_vector(DenseVector input, int k, int j, int dir, DenseVector &v, T &beta) -{ - int N = input.size(); - T m = *max_element(input.begin() + k, input.begin() + j + 1, cf); - - if(abs(m) > 0.0){ - T alpha = 0; - - for(int i=k; i 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha; - else v[dir] = -alpha; - }else{ - for(int i=k; i inline void Householder_mult(DenseMatrix &A , DenseVector v, T beta, int l, int k, int j, int trans) -{ - int N ; SizeSquare(A,N); - - if(abs(beta) > 0.0){ - for(int p=l; p inline void Householder_mult_tri(DenseMatrix &A , DenseVector v, T beta, int l, int M, int k, int j, int trans) -{ - if(abs(beta) > 0.0){ - - int N ; SizeSquare(A,N); - - DenseMatrix tmp; Resize(tmp,N,N); Fill(tmp,0); - - T s; - for(int p=l; p +template using DenseVector = std::vector; + +//#include #include namespace Grid { @@ -47,104 +49,85 @@ namespace Grid { ///////////////////////////////////////////////////////////// // Implicitly restarted lanczos ///////////////////////////////////////////////////////////// - - template - class ImplicitlyRestartedLanczos { +class ImplicitlyRestartedLanczos { - const RealD small = 1.0e-16; public: - int lock; - int get; - int Niter; - int converged; + int Niter; // Max iterations + int Nstop; // Number of evecs checked for convergence + int Nk; // Number of converged sought + int Nm; // Nm -- total number of vectors - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Np; // Np -- Number of spare vecs in kryloc space - int Nm; // Nm -- total number of vectors + RealD eresid; - RealD eresid; + //////////////////////////////////// + // Embedded objects + //////////////////////////////////// + SortEigen _sort; + LinearOperatorBase &_Linop; + OperatorFunction &_poly; - SortEigen _sort; + ///////////////////////// + // Constructor + 
///////////////////////// + ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op + OperatorFunction & poly, // polynmial + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // total vecs + RealD _eresid, // resid in lmdue deficit + int _Niter) : // Max iterations + _Linop(Linop), _poly(poly), + Nstop(_Nstop), Nk(_Nk), Nm(_Nm), + eresid(_eresid), Niter(_Niter) { }; -// GridCartesian &_fgrid; - - LinearOperatorBase &_Linop; - - OperatorFunction &_poly; - - ///////////////////////// - // Constructor - ///////////////////////// - void init(void){}; - void Abort(int ff, DenseVector &evals, DenseVector > &evecs); - - ImplicitlyRestartedLanczos( - LinearOperatorBase &Linop, // op - OperatorFunction & poly, // polynmial - int _Nstop, // sought vecs - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - int _Niter) : // Max iterations - _Linop(Linop), - _poly(poly), - Nstop(_Nstop), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - Niter(_Niter) - { - Np = Nm-Nk; assert(Np>0); - }; - - ImplicitlyRestartedLanczos( - LinearOperatorBase &Linop, // op +#if 0 + ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op OperatorFunction & poly, // polynmial int _Nk, // sought vecs - int _Nm, // spare vecs + int _Nm, // total vecs RealD _eresid, // resid in lmdue deficit int _Niter) : // Max iterations - _Linop(Linop), - _poly(poly), - Nstop(_Nk), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - Niter(_Niter) - { - Np = Nm-Nk; assert(Np>0); - }; + _Linop(Linop), _poly(poly), + Nstop(_Nk), Nk(_Nk), Nm(_Nm), + eresid(_eresid), Niter(_Niter) { }; +#endif - ///////////////////////// - // Sanity checked this routine (step) against Saad. - ///////////////////////// - void RitzMatrix(DenseVector& evec,int k){ +#if 0 + void calc(DenseVector& eval, + DenseVector& evec, + const Field& src, + int& Nconv); - if(1) return; + void step(DenseVector& lmd, + DenseVector& lme, + DenseVector& evec, + Field& w,int Nm,int k); - GridBase *grid = evec[0]._grid; - Field w(grid); - std::cout << "RitzMatrix "<1 ) { - if (abs(in) >1.0e-9 ) { - std::cout<<"oops"< &Qt) ; + + static RealD normalise(Field& v) ; + void orthogonalize(Field& w, DenseVector& evec, int k); + void diagonalize(DenseVector& lmd, + DenseVector& lme, + int N2, int N1, + DenseVector& Qt, + GridBase *grid); + + void qr_decomp(DenseVector& lmd, + DenseVector& lme, + int Nk, int Nm, + DenseVector& Qt, + RealD Dsh, int kmin, int kmax); + +#ifdef USE_LAPACK + void diagonalize_lapack(DenseVector& lmd, + DenseVector& lme, + int N1, int N2, + DenseVector& Qt, + GridBase *grid); +#endif +#endif /* Saad PP. 195 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 @@ -161,12 +144,12 @@ public: DenseVector& evec, Field& w,int Nm,int k) { + const RealD tiny = 1.0e-20; assert( k< Nm ); _poly(_Linop,evec[k],w); // 3. wk:=Avk−βkv_{k−1} - if(k>0){ - w -= lme[k-1] * evec[k-1]; - } + + if(k>0) w -= lme[k-1] * evec[k-1]; ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) RealD alph = real(zalph); @@ -176,29 +159,20 @@ public: RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop // 7. 
vk+1 := wk/βk+1 -// std::cout << "alpha = " << zalph << " beta "<0) { - orthogonalize(w,evec,k); // orthonormalise - } - - if(k < Nm-1) evec[k+1] = w; + if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise + if ( k < Nm-1) evec[k+1] = w; + + if ( beta < tiny ) std::cout << " beta is tiny "<& lmd, - DenseVector& lme, - int Nk, - int Nm, - DenseVector& Qt, - RealD Dsh, - int kmin, - int kmax) + + void qr_decomp(DenseVector& lmd, // Nm + DenseVector& lme, // Nm + int Nk, int Nm, + DenseVector& Qt, // Nm x Nm matrix + RealD Dsh, int kmin, int kmax) { int k = kmin-1; RealD x; @@ -218,7 +192,7 @@ public: lme[k+1] = c*lme[k+1]; for(int i=0; i& lmd, - DenseVector& lme, - int N1, - int N2, - DenseVector& Qt, - GridBase *grid){ - const int size = Nm; -// tevals.resize(size); -// tevecs.resize(size); - int NN = N1; - double evals_tmp[NN]; - double evec_tmp[NN][NN]; - memset(evec_tmp[0],0,sizeof(double)*NN*NN); -// double AA[NN][NN]; - double DD[NN]; - double EE[NN]; - for (int i = 0; i< NN; i++) - for (int j = i - 1; j <= i + 1; j++) - if ( j < NN && j >= 0 ) { - if (i==j) DD[i] = lmd[i]; - if (i==j) evals_tmp[i] = lmd[i]; - if (j==(i-1)) EE[j] = lme[j]; + DenseVector& lme, + int N1, + int N2, + DenseVector& Qt, + GridBase *grid) + { + const int size = Nm; + int NN = N1; + double evals_tmp[NN]; + double evec_tmp[NN][NN]; + memset(evec_tmp[0],0,sizeof(double)*NN*NN); + double DD[NN]; + double EE[NN]; + for (int i = 0; i< NN; i++) { + for (int j = i - 1; j <= i + 1; j++) { + if ( j < NN && j >= 0 ) { + if (i==j) DD[i] = lmd[i]; + if (i==j) evals_tmp[i] = lmd[i]; + if (j==(i-1)) EE[j] = lme[j]; + } + } } - int evals_found; - int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; - int liwork = 3+NN*10 ; - int iwork[liwork]; - double work[lwork]; - int isuppz[2*NN]; - char jobz = 'V'; // calculate evals & evecs - char range = 'I'; // calculate all evals - // char range = 'A'; // calculate all evals - char uplo = 'U'; // refer to upper half of original matrix - char compz = 'I'; // Compute eigenvectors of tridiagonal matrix - int ifail[NN]; - int info; -// int total = QMP_get_number_of_nodes(); -// int node = QMP_get_node_number(); -// GridBase *grid = evec[0]._grid; - int total = grid->_Nprocessors; - int node = grid->_processor; - int interval = (NN/total)+1; - double vl = 0.0, vu = 0.0; - int il = interval*node+1 , iu = interval*(node+1); - if (iu > NN) iu=NN; - double tol = 0.0; - if (1) { - memset(evals_tmp,0,sizeof(double)*NN); - if ( il <= NN){ - printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu); - LAPACK_dstegr(&jobz, &range, &NN, - (double*)DD, (double*)EE, - &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' - &tol, // tolerance - &evals_found, evals_tmp, (double*)evec_tmp, &NN, - isuppz, - work, &lwork, iwork, &liwork, - &info); - for (int i = iu-1; i>= il-1; i--){ - printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]); - evals_tmp[i] = evals_tmp[i - (il-1)]; - if (il>1) evals_tmp[i-(il-1)]=0.; - for (int j = 0; j< NN; j++){ - evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; - if (il>1) evec_tmp[i-(il-1)][j]=0.; - } - } + int evals_found; + int lwork = ( (18*NN) > (1+4*NN+NN*NN)? 
(18*NN):(1+4*NN+NN*NN)) ; + int liwork = 3+NN*10 ; + int iwork[liwork]; + double work[lwork]; + int isuppz[2*NN]; + char jobz = 'V'; // calculate evals & evecs + char range = 'I'; // calculate all evals + // char range = 'A'; // calculate all evals + char uplo = 'U'; // refer to upper half of original matrix + char compz = 'I'; // Compute eigenvectors of tridiagonal matrix + int ifail[NN]; + int info; + int total = grid->_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + int il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + double tol = 0.0; + if (1) { + memset(evals_tmp,0,sizeof(double)*NN); + if ( il <= NN){ + LAPACK_dstegr(&jobz, &range, &NN, + (double*)DD, (double*)EE, + &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' + &tol, // tolerance + &evals_found, evals_tmp, (double*)evec_tmp, &NN, + isuppz, + work, &lwork, iwork, &liwork, + &info); + for (int i = iu-1; i>= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for (int j = 0; j< NN; j++){ + evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; + if (il>1) evec_tmp[i-(il-1)][j]=0.; + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,NN*NN); + } + } + // cheating a bit. + // It is better to sort instead of just reversing it, + // but the document of the routine says evals are sorted in increasing order. + // qr gives evals in decreasing order. + for(int i=0;iGlobalSumVector(evals_tmp,NN); - grid->GlobalSumVector((double*)evec_tmp,NN*NN); - } - } -// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order. - for(int i=0;i& lmd, DenseVector& lme, int N2, @@ -354,24 +324,23 @@ public: if(!check_lapack) return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid); - DenseVector lmd2(N1); - DenseVector lme2(N1); - DenseVector Qt2(N1*N1); - for(int k=0; k lmd2(N1); + DenseVector lme2(N1); + DenseVector Qt2(N1*N1); + for(int k=0; k lmd3(N2); - for(int k=0; k lmd3(N2); + for(int k=0; kSMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <SMALL) std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <SMALL) std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] < &Qt) { for(int i=0; i & bq, - Field &bf, - DenseMatrix &H){ - - GridBase *grid = bq[0]._grid; - - RealD beta; - RealD sqbt; - RealD alpha; - - for(int i=start;i 1) std::cout << "orthagonality refined " << re << " times" < evals, - DenseVector evecs){ - int N= evals.size(); - _sort.push(evals,evecs, evals.size(),N); - } - - void ImplicitRestart(int TM, DenseVector &evals, DenseVector > &evecs, DenseVector &bq, Field &bf, int cont) - { - std::cout << "ImplicitRestart begin. 
Eigensort starting\n"; - - DenseMatrix H; Resize(H,Nm,Nm); - - EigenSort(evals, evecs); - - ///Assign shifts - int K=Nk; - int M=Nm; - int P=Np; - int converged=0; - if(K - converged < 4) P = (M - K-1); //one - // DenseVector shifts(P + shift_extra.size()); - DenseVector shifts(P); - for(int k = 0; k < P; ++k) - shifts[k] = evals[k]; - - /// Shift to form a new H and q - DenseMatrix Q; Resize(Q,TM,TM); - Unity(Q); - Shift(Q, shifts); // H is implicitly passed in in Rudy's Shift routine - - int ff = K; - - /// Shifted H defines a new K step Arnoldi factorization - RealD beta = H[ff][ff-1]; - RealD sig = Q[TM - 1][ff - 1]; - std::cout << "beta = " << beta << " sig = " << real(sig) < q Q - times_real(bq, Q, TM); - - std::cout << norm2(bq[0]) << " -- after " << ff < &bq, Field &bf, DenseVector > & evecs,DenseVector &evals) - { - init(); - - int M=Nm; - - DenseMatrix H; Resize(H,Nm,Nm); - Resize(evals,Nm); - Resize(evecs,Nm); - - int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with - - if(ff < M) { - std::cout << "Krylov: aborting ff "< " << it << std::endl; - int lock_num = lock ? converged : 0; - DenseVector tevals(M - lock_num ); - DenseMatrix tevecs; Resize(tevecs,M - lock_num,M - lock_num); - - //check residual of polynominal - TestConv(H,M, tevals, tevecs); - - if(converged >= Nk) - break; - - ImplicitRestart(ff, tevals,tevecs,H); - } - Wilkinson(H, evals, evecs, small); - // Check(); - - std::cout << "Done "< & H,DenseMatrix &Q, DenseVector shifts) { - - int P; Size(shifts,P); - int M; SizeSquare(Q,M); - - Unity(Q); - - int lock_num = lock ? converged : 0; - - RealD t_Househoulder_vector(0.0); - RealD t_Househoulder_mult(0.0); - - for(int i=0;i ck(3), v(3); - - x = H[lock_num+0][lock_num+0]-shifts[i]; - y = H[lock_num+1][lock_num+0]; - ck[0] = x; ck[1] = y; ck[2] = 0; - - normalise(ck); ///Normalization cancels in PHP anyway - RealD beta; - - Householder_vector(ck, 0, 2, v, beta); - Householder_mult(H,v,beta,0,lock_num+0,lock_num+2,0); - Householder_mult(H,v,beta,0,lock_num+0,lock_num+2,1); - ///Accumulate eigenvector - Householder_mult(Q,v,beta,0,lock_num+0,lock_num+2,1); - - int sw = 0; - for(int k=lock_num+0;k(ck, 0, 2-sw, v, beta); - Householder_mult(H,v, beta,0,k+1,k+3-sw,0); - Householder_mult(H,v, beta,0,k+1,k+3-sw,1); - ///Accumulate eigenvector - Householder_mult(Q,v, beta,0,k+1,k+3-sw,1); - } - } - } - - void TestConv(DenseMatrix & H,int SS, - DenseVector &bq, Field &bf, - DenseVector &tevals, DenseVector > &tevecs, - int lock, int converged) - { - std::cout << "Converged " << converged << " so far." << std::endl; - int lock_num = lock ? 
converged : 0; - int M = Nm; - - ///Active Factorization - DenseMatrix AH; Resize(AH,SS - lock_num,SS - lock_num ); - - AH = GetSubMtx(H,lock_num, SS, lock_num, SS); - - int NN=tevals.size(); - int AHsize=SS-lock_num; - - RealD small=1.0e-16; - Wilkinson(AH, tevals, tevecs, small); - - EigenSort(tevals, tevecs); - - RealD resid_nrm= norm2(bf); - - if(!lock) converged = 0; -#if 0 - for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){ - - RealD diff = 0; - diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm; - - std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl; - - if(diff < converged) { - - if(lock) { - - DenseMatrix Q; Resize(Q,M,M); - bool herm = true; - - Lock(H, Q, tevals[i], converged, small, SS, herm); - - times_real(bq, Q, bq.size()); - bf = Q[M - 1][M - 1]* bf; - lock_num++; - } - converged++; - std::cout << " converged on eval " << converged << " of " << Nk << std::endl; - } else { - break; - } - } -#endif - std::cout << "Got " << converged << " so far " < &evals, - DenseVector > &evecs) { - - DenseVector goodval(this->get); - - EigenSort(evals,evecs); - - int NM = Nm; - - DenseVector< DenseVector > V; Size(V,NM); - DenseVector QZ(NM*NM); - - for(int i = 0; i < NM; i++){ - for(int j = 0; j < NM; j++){ - // evecs[i][j]; - } - } - } - - -/** - There is some matrix Q such that for any vector y - Q.e_1 = y and Q is unitary. -**/ - template - static T orthQ(DenseMatrix &Q, DenseVector y){ - int N = y.size(); //Matrix Size - Fill(Q,0.0); - T tau; - for(int i=0;i 0.0){ - - T gam = conj( (y[j]/tau)/tau0 ); - for(int k=0;k<=j-1;k++){ - Q[k][j]=-gam*y[k]; - } - Q[j][j]=tau0/tau; - } else { - Q[j-1][j]=1.0; - } - tau0 = tau; - } - return tau; - } - -/** - There is some matrix Q such that for any vector y - Q.e_k = y and Q is unitary. 
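
For intuition, a Householder reflector realises exactly such a Q; the
Q.e_k = y case then follows from the Q.e_1 = y case by cycling columns, which
is what the SL(Q) call in orthU below does. A standalone Eigen sketch,
independent of the DenseMatrix helpers being removed here (reflectorFor is an
illustrative name, not part of the original code):

    #include <Eigen/Dense>

    // Q = I - 2 v v^T / (v^T v) with v = y - |y| e_1 is symmetric, orthogonal,
    // and maps e_1 to y/|y|, i.e. Q.e_1 = y for a normalised vector y.
    Eigen::MatrixXd reflectorFor(const Eigen::VectorXd &y) {
      Eigen::VectorXd v = y;
      v(0) -= y.norm();                 // v = y - |y| e_1
      Eigen::MatrixXd Q = Eigen::MatrixXd::Identity(y.size(), y.size());
      double vv = v.squaredNorm();
      if (vv > 1e-30) Q -= (2.0 / vv) * (v * v.transpose());
      return Q;                         // y already along e_1 gives Q = I
    }
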
-**/ - template< class T> - static T orthU(DenseMatrix &Q, DenseVector y){ - T tau = orthQ(Q,y); - SL(Q); - return tau; - } - - -/** - Wind up with a matrix with the first con rows untouched - -say con = 2 - Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum - and the matrix is upper hessenberg - and with f and Q appropriately modidied with Q is the arnoldi factorization - -**/ - -template -static void Lock(DenseMatrix &H, // Hess mtx - DenseMatrix &Q, // Lock Transform - T val, // value to be locked - int con, // number already locked - RealD small, - int dfg, - bool herm) -{ - //ForceTridiagonal(H); - - int M = H.dim; - DenseVector vec; Resize(vec,M-con); - - DenseMatrix AH; Resize(AH,M-con,M-con); - AH = GetSubMtx(H,con, M, con, M); - - DenseMatrix QQ; Resize(QQ,M-con,M-con); - - Unity(Q); Unity(QQ); - - DenseVector evals; Resize(evals,M-con); - DenseMatrix evecs; Resize(evecs,M-con,M-con); - - Wilkinson(AH, evals, evecs, small); - - int k=0; - RealD cold = abs( val - evals[k]); - for(int i=1;icon+2; j--){ - - DenseMatrix U; Resize(U,j-1-con,j-1-con); - DenseVector z; Resize(z,j-1-con); - T nm = norm(z); - for(int k = con+0;k Hb; Resize(Hb,j-1-con,M); - - for(int a = 0;a Qb; Resize(Qb,M,M); - - for(int a = 0;a Hc; Resize(Hc,M,M); - - for(int a = 0;a { FieldMetaData header; IldgReader _IldgReader; _IldgReader.open(config); - _IldgReader.readConfiguration(config,U,header); // format from the header + _IldgReader.readConfiguration(U,header); // format from the header _IldgReader.close(); std::cout << GridLogMessage << "Read ILDG Configuration from " << config diff --git a/tests/solver/Test_dwf_lanczos.cc b/tests/solver/Test_dwf_lanczos.cc index bb978186..48cca378 100644 --- a/tests/solver/Test_dwf_lanczos.cc +++ b/tests/solver/Test_dwf_lanczos.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::TepidConfiguration(RNG4, Umu); + SU3::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); for(int mu=0;mu Date: Wed, 21 Jun 2017 02:26:03 +0100 Subject: [PATCH 084/170] Simplified lanczos, added Eigen diagonalisation. Curious if we can deprecate dependencly on BLAS. Will see when we get 48^3 running on our BG/Q port --- .../iterative/BlockConjugateGradient.h | 7 +- lib/algorithms/iterative/EigenSort.h | 81 -- .../iterative/ImplicitlyRestartedLanczos.h | 1074 +++++++++-------- tests/solver/Test_dwf_lanczos.cc | 9 +- 4 files changed, 547 insertions(+), 624 deletions(-) delete mode 100644 lib/algorithms/iterative/EigenSort.h diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index f8b83b1f..9418f63c 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -56,11 +56,8 @@ class BlockConjugateGradient : public OperatorFunction { Integer IterationsToComplete; //Number of iterations the CG took to finish. 
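
The "Thin QR factorisation" flagged just below is the step that keeps the
block of vectors orthonormal each iteration: factor the block Z as Z = Q R
with Q^dag Q = 1 and R upper triangular. A standalone Eigen sketch of that
factorisation, illustrative only and not Grid's implementation:

    #include <Eigen/Dense>

    // Thin QR of an n x m block (n >> m): Z = Q R, with Q an n x m matrix
    // satisfying Q^dag Q = 1_{m x m} and R an m x m upper triangular matrix.
    void thinQR(const Eigen::MatrixXcd &Z, Eigen::MatrixXcd &Q, Eigen::MatrixXcd &R)
    {
      Eigen::HouseholderQR<Eigen::MatrixXcd> qr(Z);
      const int m = Z.cols();
      Q = qr.householderQ() * Eigen::MatrixXcd::Identity(Z.rows(), m); // thin Q
      R = qr.matrixQR().topRows(m).triangularView<Eigen::Upper>();     // m x m
    }
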
Filled in upon completion BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) - : Tolerance(tol), - CGtype(cgtype), - blockDim(_Orthog), - MaxIterations(maxit), - ErrorOnNoConverge(err_on_no_conv){}; + : Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) + {}; //////////////////////////////////////////////////////////////////////////////////////////////////// // Thin QR factorisation (google it) diff --git a/lib/algorithms/iterative/EigenSort.h b/lib/algorithms/iterative/EigenSort.h deleted file mode 100644 index 23621544..00000000 --- a/lib/algorithms/iterative/EigenSort.h +++ /dev/null @@ -1,81 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/algorithms/iterative/EigenSort.h - - Copyright (C) 2015 - -Author: Peter Boyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_EIGENSORT_H -#define GRID_EIGENSORT_H - - -namespace Grid { - ///////////////////////////////////////////////////////////// - // Eigen sorter to begin with - ///////////////////////////////////////////////////////////// - -template -class SortEigen { - private: - -//hacking for testing for now - private: - static bool less_lmd(RealD left,RealD right){ - return left > right; - } - static bool less_pair(std::pair& left, - std::pair& right){ - return left.first > (right.first); - } - - - public: - - void push(DenseVector& lmd, - DenseVector& evec,int N) { - DenseVector cpy(lmd.size(),evec[0]._grid); - for(int i=0;i > emod(lmd.size()); - for(int i=0;i(lmd[i],&cpy[i]); - - partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); - - typename DenseVector >::iterator it = emod.begin(); - for(int i=0;ifirst; - evec[i]=*(it->second); - ++it; - } - } - void push(DenseVector& lmd,int N) { - std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); - } - bool saturated(RealD lmd, RealD thrs) { - return fabs(lmd) > fabs(thrs); - } -}; - -} -#endif diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index acd67592..571bf1b2 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -7,7 +7,8 @@ Copyright (C) 2015 Author: Peter Boyle -Author: paboyle +Author: Chulwoo Jung +Author: Guido Cossu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,35 +32,71 @@ Author: paboyle #include //memset -#ifdef USE_LAPACK -void 
LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, - double *vl, double *vu, int *il, int *iu, double *abstol, - int *m, double *w, double *z, int *ldz, int *isuppz, - double *work, int *lwork, int *iwork, int *liwork, - int *info); -#endif - -template using DenseVector = std::vector; - -//#include -#include - namespace Grid { + enum IRLdiagonalisation { + IRLdiagonaliseWithDSTEGR, + IRLdiagonaliseWithQR, + IRLdiagonaliseWithEigen + }; + //////////////////////////////////////////////////////////////////////////////// + // Helper class for sorting the evalues AND evectors by Field + // Use pointer swizzle on vectors + //////////////////////////////////////////////////////////////////////////////// +template +class SortEigen { + private: + static bool less_lmd(RealD left,RealD right){ + return left > right; + } + static bool less_pair(std::pair& left, + std::pair& right){ + return left.first > (right.first); + } + + public: + void push(std::vector& lmd,std::vector& evec,int N) { + + //////////////////////////////////////////////////////////////////////// + // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set. + // : The vector reorder should be done by pointer swizzle somehow + //////////////////////////////////////////////////////////////////////// + std::vector cpy(lmd.size(),evec[0]._grid); + for(int i=0;i > emod(lmd.size()); + + for(int i=0;i(lmd[i],&cpy[i]); + + partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); + + typename std::vector >::iterator it = emod.begin(); + for(int i=0;ifirst; + evec[i]=*(it->second); + ++it; + } + } + void push(std::vector& lmd,int N) { + std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); + } + bool saturated(RealD lmd, RealD thrs) { + return fabs(lmd) > fabs(thrs); + } +}; + ///////////////////////////////////////////////////////////// // Implicitly restarted lanczos ///////////////////////////////////////////////////////////// template class ImplicitlyRestartedLanczos { - -public: - int Niter; // Max iterations - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Nm; // Nm -- total number of vectors - +private: + int MaxIter; // Max iterations + int Nstop; // Number of evecs checked for convergence + int Nk; // Number of converged sought + int Nm; // Nm -- total number of vectors RealD eresid; - + IRLdiagonalisation diagonalisation; //////////////////////////////////// // Embedded objects //////////////////////////////////// @@ -70,362 +107,20 @@ public: ///////////////////////// // Constructor ///////////////////////// +public: ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op - OperatorFunction & poly, // polynmial - int _Nstop, // sought vecs + OperatorFunction & poly, // polynomial + int _Nstop, // really sought vecs int _Nk, // sought vecs int _Nm, // total vecs - RealD _eresid, // resid in lmdue deficit - int _Niter) : // Max iterations + RealD _eresid, // resid in lmd deficit + int _MaxIter, // Max iterations + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) : _Linop(Linop), _poly(poly), - Nstop(_Nstop), Nk(_Nk), Nm(_Nm), - eresid(_eresid), Niter(_Niter) { }; - -#if 0 - ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op - OperatorFunction & poly, // polynmial - int _Nk, // sought vecs - int _Nm, // total vecs - RealD _eresid, // resid in lmdue deficit - int _Niter) : // Max iterations - _Linop(Linop), _poly(poly), - Nstop(_Nk), Nk(_Nk), Nm(_Nm), - eresid(_eresid), Niter(_Niter) { }; -#endif - -#if 
0 - void calc(DenseVector& eval, - DenseVector& evec, - const Field& src, - int& Nconv); - - void step(DenseVector& lmd, - DenseVector& lme, - DenseVector& evec, - Field& w,int Nm,int k); - - void setUnit_Qt(int Nm, DenseVector &Qt) ; - - static RealD normalise(Field& v) ; - void orthogonalize(Field& w, DenseVector& evec, int k); - void diagonalize(DenseVector& lmd, - DenseVector& lme, - int N2, int N1, - DenseVector& Qt, - GridBase *grid); - - void qr_decomp(DenseVector& lmd, - DenseVector& lme, - int Nk, int Nm, - DenseVector& Qt, - RealD Dsh, int kmin, int kmax); - -#ifdef USE_LAPACK - void diagonalize_lapack(DenseVector& lmd, - DenseVector& lme, - int N1, int N2, - DenseVector& Qt, - GridBase *grid); -#endif -#endif - -/* Saad PP. 195 -1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 -2. For k = 1,2,...,m Do: -3. wk:=Avk−βkv_{k−1} -4. αk:=(wk,vk) // -5. wk:=wk−αkvk // wk orthog vk -6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop -7. vk+1 := wk/βk+1 -8. EndDo - */ - void step(DenseVector& lmd, - DenseVector& lme, - DenseVector& evec, - Field& w,int Nm,int k) - { - const RealD tiny = 1.0e-20; - assert( k< Nm ); - - _poly(_Linop,evec[k],w); // 3. wk:=Avk−βkv_{k−1} - - if(k>0) w -= lme[k-1] * evec[k-1]; - - ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) - RealD alph = real(zalph); - - w = w - alph * evec[k];// 5. wk:=wk−αkvk - - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 - - lmd[k] = alph; - lme[k] = beta; - - if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise - if ( k < Nm-1) evec[k+1] = w; - - if ( beta < tiny ) std::cout << " beta is tiny "<& lmd, // Nm - DenseVector& lme, // Nm - int Nk, int Nm, - DenseVector& Qt, // Nm x Nm matrix - RealD Dsh, int kmin, int kmax) - { - int k = kmin-1; - RealD x; - - RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); - RealD c = ( lmd[k] -Dsh) *Fden; - RealD s = -lme[k] *Fden; - - RealD tmpa1 = lmd[k]; - RealD tmpa2 = lmd[k+1]; - RealD tmpb = lme[k]; - - lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; - lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; - lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; - x =-s*lme[k+1]; - lme[k+1] = c*lme[k+1]; - - for(int i=0; i& lmd, - DenseVector& lme, - int N1, - int N2, - DenseVector& Qt, - GridBase *grid) - { - const int size = Nm; - int NN = N1; - double evals_tmp[NN]; - double evec_tmp[NN][NN]; - memset(evec_tmp[0],0,sizeof(double)*NN*NN); - double DD[NN]; - double EE[NN]; - for (int i = 0; i< NN; i++) { - for (int j = i - 1; j <= i + 1; j++) { - if ( j < NN && j >= 0 ) { - if (i==j) DD[i] = lmd[i]; - if (i==j) evals_tmp[i] = lmd[i]; - if (j==(i-1)) EE[j] = lme[j]; - } - } - } - int evals_found; - int lwork = ( (18*NN) > (1+4*NN+NN*NN)? 
(18*NN):(1+4*NN+NN*NN)) ; - int liwork = 3+NN*10 ; - int iwork[liwork]; - double work[lwork]; - int isuppz[2*NN]; - char jobz = 'V'; // calculate evals & evecs - char range = 'I'; // calculate all evals - // char range = 'A'; // calculate all evals - char uplo = 'U'; // refer to upper half of original matrix - char compz = 'I'; // Compute eigenvectors of tridiagonal matrix - int ifail[NN]; - int info; - int total = grid->_Nprocessors; - int node = grid->_processor; - int interval = (NN/total)+1; - double vl = 0.0, vu = 0.0; - int il = interval*node+1 , iu = interval*(node+1); - if (iu > NN) iu=NN; - double tol = 0.0; - if (1) { - memset(evals_tmp,0,sizeof(double)*NN); - if ( il <= NN){ - LAPACK_dstegr(&jobz, &range, &NN, - (double*)DD, (double*)EE, - &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' - &tol, // tolerance - &evals_found, evals_tmp, (double*)evec_tmp, &NN, - isuppz, - work, &lwork, iwork, &liwork, - &info); - for (int i = iu-1; i>= il-1; i--){ - evals_tmp[i] = evals_tmp[i - (il-1)]; - if (il>1) evals_tmp[i-(il-1)]=0.; - for (int j = 0; j< NN; j++){ - evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; - if (il>1) evec_tmp[i-(il-1)][j]=0.; - } - } - } - { - grid->GlobalSumVector(evals_tmp,NN); - grid->GlobalSumVector((double*)evec_tmp,NN*NN); - } - } - // cheating a bit. - // It is better to sort instead of just reversing it, - // but the document of the routine says evals are sorted in increasing order. - // qr gives evals in decreasing order. - for(int i=0;i& lmd, - DenseVector& lme, - int N2, - int N1, - DenseVector& Qt, - GridBase *grid) - { - -#ifdef USE_LAPACK - const int check_lapack=0; // just use lapack if 0, check against lapack if 1 - - if(!check_lapack) - return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid); - - DenseVector lmd2(N1); - DenseVector lme2(N1); - DenseVector Qt2(N1*N1); - for(int k=0; k= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - Niter = iter; -#ifdef USE_LAPACK - if(check_lapack){ - const double SMALL=1e-8; - diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid); - DenseVector lmd3(N2); - for(int k=0; kSMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] < dds){ - kmin = j+1; - break; - } - } - } - std::cout << "[QL method] Error - Too many iteration: "<& evec, - int k) - { - typedef typename Field::scalar_type MyComplex; - MyComplex ip; - - if ( 0 ) { - for(int j=0; j &Qt) { - for(int i=0; i& eval, - DenseVector& evec, - const Field& src, - int& Nconv) - { - - GridBase *grid = evec[0]._grid; - assert(grid == src._grid); - - std::cout << " -- seek Nk = " << Nk <<" vectors"<< std::endl; - std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl; - std::cout << " -- total Nm = " << Nm <<" vectors"<< std::endl; - std::cout << " -- size of eval = " << eval.size() << std::endl; - std::cout << " -- size of evec = " << evec.size() << std::endl; - - assert(Nm == evec.size() && Nm == eval.size()); - - DenseVector lme(Nm); - DenseVector lme2(Nm); - DenseVector eval2(Nm); - DenseVector Qt(Nm*Nm); - DenseVector Iconv(Nm); - - DenseVector B(Nm,grid); // waste of space replicating - - Field f(grid); - Field v(grid); - - int k1 = 1; - int k2 = Nk; - - Nconv = 0; - - RealD beta_k; - - // Set initial vector - evec[0] = src; - std:: cout <<"norm2(src)= " << norm2(src)<& eval, std::vector& evec, const Field& src, int& Nconv) + { - for(int i=0; i<(Nk+1); ++i) B[i] = 0.0; - - for(int j=k1-1; j=Nstop ){ - goto converged; - } - } // end of iter loop + 
GridBase *grid = evec[0]._grid; + assert(grid == src._grid); + + std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; + std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + std::cout << GridLogMessage <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; + std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl; + std::cout << GridLogMessage <<" -- total Nm = " << Nm <<" vectors"<< std::endl; + std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl; + std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl; + if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { + std::cout << GridLogMessage << "Diagonalisation is DSTEGR "< lme(Nm); + std::vector lme2(Nm); + std::vector eval2(Nm); + Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); + std::vector Iconv(Nm); + + std::vector B(Nm,grid); // waste of space replicating + + Field f(grid); + Field v(grid); + + int k1 = 1; + int k2 = Nk; + + Nconv = 0; + + RealD beta_k; + + // Set initial vector + evec[0] = src; + std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<=Nstop ){ + goto converged; + } + } // end of iter loop + + std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + std::cout<< GridLogError <<" ImplicitlyRestartedLanczos::calc() NOT converged."; + std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + abort(); + + converged: + // Sorting + eval.resize(Nconv); + evec.resize(Nconv,grid); + for(int i=0; i& lmd, + std::vector& lme, + std::vector& evec, + Field& w,int Nm,int k) + { + const RealD tiny = 1.0e-20; + assert( k< Nm ); + + _poly(_Linop,evec[k],w); // 3. wk:=Avk−βkv_{k−1} + + if(k>0) w -= lme[k-1] * evec[k-1]; + + ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) + RealD alph = real(zalph); + + w = w - alph * evec[k];// 5. wk:=wk−αkvk + + RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop + // 7. 
vk+1 := wk/βk+1 + + lmd[k] = alph; + lme[k] = beta; + + if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise + if ( k < Nm-1) evec[k+1] = w; + + if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<& lmd, // Nm + std::vector& lme, // Nm + int Nk, int Nm, // Nk, Nm + Eigen::MatrixXd& Qt, // Nm x Nm matrix + RealD Dsh, int kmin, int kmax) + { + int k = kmin-1; + RealD x; + + RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); + RealD c = ( lmd[k] -Dsh) *Fden; + RealD s = -lme[k] *Fden; + + RealD tmpa1 = lmd[k]; + RealD tmpa2 = lmd[k+1]; + RealD tmpb = lme[k]; + + lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; + lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; + lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; + x =-s*lme[k+1]; + lme[k+1] = c*lme[k+1]; + + for(int i=0; i& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) + { + Qt = Eigen::MatrixXd::Identity(Nm,Nm); + if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { + diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid); + } else if ( diagonalisation == IRLdiagonaliseWithQR ) { + diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid); + } else if ( diagonalisation == IRLdiagonaliseWithEigen ) { + diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); + } else { + assert(0); + } + } + +#ifdef USE_LAPACK +void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, + double *vl, double *vu, int *il, int *iu, double *abstol, + int *m, double *w, double *z, int *ldz, int *isuppz, + double *work, int *lwork, int *iwork, int *liwork, + int *info); #endif +void diagonalize_lapack(std::vector& lmd, + std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd& Qt, + GridBase *grid) +{ +#ifdef USE_LAPACK + const int size = Nm; + int NN = Nk; + double evals_tmp[NN]; + double evec_tmp[NN][NN]; + memset(evec_tmp[0],0,sizeof(double)*NN*NN); + double DD[NN]; + double EE[NN]; + for (int i = 0; i< NN; i++) { + for (int j = i - 1; j <= i + 1; j++) { + if ( j < NN && j >= 0 ) { + if (i==j) DD[i] = lmd[i]; + if (i==j) evals_tmp[i] = lmd[i]; + if (j==(i-1)) EE[j] = lme[j]; + } + } + } + int evals_found; + int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; + int liwork = 3+NN*10 ; + int iwork[liwork]; + double work[lwork]; + int isuppz[2*NN]; + char jobz = 'V'; // calculate evals & evecs + char range = 'I'; // calculate all evals + // char range = 'A'; // calculate all evals + char uplo = 'U'; // refer to upper half of original matrix + char compz = 'I'; // Compute eigenvectors of tridiagonal matrix + int ifail[NN]; + int info; + int total = grid->_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + int il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + double tol = 0.0; + if (1) { + memset(evals_tmp,0,sizeof(double)*NN); + if ( il <= NN){ + LAPACK_dstegr(&jobz, &range, &NN, + (double*)DD, (double*)EE, + &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' + &tol, // tolerance + &evals_found, evals_tmp, (double*)evec_tmp, &NN, + isuppz, + work, &lwork, iwork, &liwork, + &info); + for (int i = iu-1; i>= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for (int j = 0; j< NN; j++){ + evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; + if (il>1) evec_tmp[i-(il-1)][j]=0.; + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,NN*NN); + } + } + // Safer to sort instead of just reversing it, + // but the document of the routine says evals are sorted in increasing order. 
+ // qr gives evals in decreasing order. + for(int i=0;i& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) + { + int Niter = 100*Nm; + int kmin = 1; + int kmax = Nk; + + // (this should be more sophisticated) + for(int iter=0; iter= kmin; --j){ + RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); + if(fabs(lme[j-1])+dds > dds){ + kmax = j+1; + goto continued; + } + } + Niter = iter; + return; + + continued: + for(int j=0; j dds){ + kmin = j+1; + break; + } + } + } + std::cout << GridLogError << "[QL method] Error - Too many iteration: "<& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, // Nm x Nm + GridBase *grid) + { + Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk); + + for(int i=0;i eigensolver(TriDiag); + + for (int i = 0; i < Nk; i++) { + lmd[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i); + } + } + } + + + static RealD normalise(Field& v) + { + RealD nn = norm2(v); + nn = sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void orthogonalize(Field& w, std::vector& evec, int k) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j eval(Nm); - FermionField src(FrbGrid); gaussian(RNG5rb,src); + FermionField src(FrbGrid); + gaussian(RNG5rb,src); std::vector evec(Nm,FrbGrid); for(int i=0;i<1;i++){ - std::cout << i<<" / "<< Nm<< " grid pointer "< Date: Wed, 21 Jun 2017 02:50:09 +0100 Subject: [PATCH 085/170] Clean up finished. Could shrink Lanczos to around 400 lines at a push --- .../iterative/ImplicitlyRestartedLanczos.h | 114 +++++++++--------- tests/debug/Test_synthetic_lanczos.cc | 4 +- 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 571bf1b2..a8723f32 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -39,10 +39,11 @@ namespace Grid { IRLdiagonaliseWithQR, IRLdiagonaliseWithEigen }; - //////////////////////////////////////////////////////////////////////////////// - // Helper class for sorting the evalues AND evectors by Field - // Use pointer swizzle on vectors - //////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// Helper class for sorting the evalues AND evectors by Field +// Use pointer swizzle on vectors +//////////////////////////////////////////////////////////////////////////////// template class SortEigen { private: @@ -90,7 +91,9 @@ class SortEigen { ///////////////////////////////////////////////////////////// template class ImplicitlyRestartedLanczos { + private: + int MaxIter; // Max iterations int Nstop; // Number of evecs checked for convergence int Nk; // Number of converged sought @@ -122,6 +125,29 @@ public: diagonalisation(_diagonalisation) { }; + //////////////////////////////// + // Helpers + //////////////////////////////// + static RealD normalise(Field& v) + { + RealD nn = norm2(v); + nn = sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void orthogonalize(Field& w, std::vector& evec, int k) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j K P = M − K † @@ -167,9 +193,10 @@ until convergence std::vector lme(Nm); std::vector lme2(Nm); std::vector eval2(Nm); - Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - 
std::vector Iconv(Nm); + Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); + + std::vector Iconv(Nm); std::vector B(Nm,grid); // waste of space replicating Field f(grid); @@ -218,6 +245,7 @@ until convergence // Implicitly shifted QR transformations Qt = Eigen::MatrixXd::Identity(Nm,Nm); for(int ip=k2; ip& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, // Nm x Nm + GridBase *grid) + { + Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk); + + for(int i=0;i eigensolver(TriDiag); + + for (int i = 0; i < Nk; i++) { + lmd[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i); + } + } + } + /////////////////////////////////////////////////////////////////////////// + // File could end here if settle on Eigen ??? + /////////////////////////////////////////////////////////////////////////// + void qr_decomp(std::vector& lmd, // Nm std::vector& lme, // Nm int Nk, int Nm, // Nk, Nm @@ -570,50 +620,6 @@ void diagonalize_lapack(std::vector& lmd, abort(); } - void diagonalize_Eigen(std::vector& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, // Nm x Nm - GridBase *grid) - { - Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk); - - for(int i=0;i eigensolver(TriDiag); - - for (int i = 0; i < Nk; i++) { - lmd[Nk-1-i] = eigensolver.eigenvalues()(i); - } - for (int i = 0; i < Nk; i++) { - for (int j = 0; j < Nk; j++) { - Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i); - } - } - } - - - static RealD normalise(Field& v) - { - RealD nn = norm2(v); - nn = sqrt(nn); - v = v * (1.0/nn); - return nn; - } - - void orthogonalize(Field& w, std::vector& evec, int k) - { - typedef typename Field::scalar_type MyComplex; - MyComplex ip; - - for(int j=0; j IRL(HermOp,X,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); + ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit); + ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); { From ef4f2b8c410d449ff0beea1682cfc3de9bda3f79 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 21 Jun 2017 09:22:20 +0100 Subject: [PATCH 086/170] todo update --- TODO | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TODO b/TODO index eeb7dfa5..8f80903e 100644 --- a/TODO +++ b/TODO @@ -2,8 +2,8 @@ TODO: --------------- Large item work list: -1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- -2)- MultiRHS with spread out extra dim +1)- MultiRHS with spread out extra dim +2)- Christoph's local basis expansion Lanczos 3)- BG/Q port and check 4)- Precision conversion and sort out localConvert <-- partial - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet @@ -13,6 +13,7 @@ Large item work list: 8)- HDCR resume Recent DONE +-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. 
<-- DONE -- GaugeFix into central location <-- DONE -- Scidac and Ildg metadata handling <-- DONE -- Binary I/O MPI2 IO <-- DONE From 9e56c6573007ccc857571aefa2ce3b6851f7b891 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 21 Jun 2017 14:02:58 +0100 Subject: [PATCH 087/170] Updated TODO list --- TODO | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TODO b/TODO index 8f80903e..001c6c0c 100644 --- a/TODO +++ b/TODO @@ -2,7 +2,8 @@ TODO: --------------- Large item work list: -1)- MultiRHS with spread out extra dim +1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O + 2)- Christoph's local basis expansion Lanczos 3)- BG/Q port and check 4)- Precision conversion and sort out localConvert <-- partial From af71c63f4ce48ccbe9bfdaf40d4171913483add7 Mon Sep 17 00:00:00 2001 From: Lanny91 Date: Fri, 23 Jun 2017 11:03:12 +0200 Subject: [PATCH 088/170] AVX2 fix --- lib/simd/Grid_avx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 57d9064d..f4634432 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -711,7 +711,7 @@ namespace Optimization { v2 = _mm256_hadd_epi32(v1, v1); u1 = _mm256_castsi256_si128(v2); // upper half u2 = _mm256_extracti128_si256(v2, 1); // lower half - ret = _mm256_add_epi32(u1, u2); + ret = _mm_add_epi32(u1, u2); #else // No AVX horizontal add; extract upper and lower halves of register & use // SSE intrinsics. From 56abbdf4c2fa3848fe9037cf95cf5e4930631d3a Mon Sep 17 00:00:00 2001 From: Lanny91 Date: Fri, 23 Jun 2017 11:09:14 +0200 Subject: [PATCH 089/170] AVX512 integer reduce fix (for non-intel compiler) --- lib/simd/Grid_avx512.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 458a8f7c..85d27421 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -558,7 +558,7 @@ namespace Optimization { v2 = _mm256_hadd_epi32(v1, v1); u1 = _mm256_castsi256_si128(v2) // upper half u2 = _mm256_extracti128_si256(v2, 1); // lower half - ret = _mm256_add_epi32(u1, u2); + ret = _mm_add_epi32(u1, u2); return _mm_cvtsi128_si32(ret); } #else From 869b99ec1efde04d94bdd02eb041a457accb930e Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 24 Jun 2017 10:55:54 +0100 Subject: [PATCH 090/170] Threaded calls to multiple communicators --- lib/communicator/Communicator_mpit.cc | 260 ++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 lib/communicator/Communicator_mpit.cc diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc new file mode 100644 index 00000000..07522900 --- /dev/null +++ b/lib/communicator/Communicator_mpit.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_mpi.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
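
Before the file body, the idea in one place: this "mpit" communicator
duplicates one MPI communicator per halo direction, so exchanges driven
concurrently from different threads never share a message-matching context.
A minimal standalone sketch (makeHaloComms, base and ndim are illustrative
names, not the members used below):

    #include <mpi.h>
    #include <vector>

    // MPI matches messages per (communicator, rank, tag); giving each of the
    // 2*ndim directions its own duplicated communicator therefore isolates
    // per-thread halo traffic without any tag bookkeeping.
    std::vector<MPI_Comm> makeHaloComms(MPI_Comm base, int ndim)
    {
      std::vector<MPI_Comm> halo(2*ndim);
      for (int d = 0; d < 2*ndim; d++)
        MPI_Comm_dup(base, &halo[d]);   // collective call over 'base'
      return halo;
    }

A thread owning direction dir can then use a plain blocking MPI_Sendrecv on
halo[dir], exactly as StencilSendToRecvFromBegin does at the end of this file.
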
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#include 
+#include 
+#include 
+#include 
+
+namespace Grid {
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is set up once and independent of cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+MPI_Comm CartesianCommunicator::communicator_world;
+
+// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+ int flag;
+ int provided;
+ MPI_Initialized(&flag); // needed to coexist with other libs apparently
+ if ( !flag ) {
+ MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
+ if ( provided != MPI_THREAD_MULTIPLE ) {
+ QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+ }
+ }
+ MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
+ ShmInitGeneric();
+}
+
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{
+ _ndimension = processors.size();
+ std::vector<int> periodic(_ndimension,1);
+
+ _Nprocessors=1;
+ _processors = processors;
+ _processor_coor.resize(_ndimension);
+
+ MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+ MPI_Comm_rank(communicator,&_processor);
+ MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+ for(int i=0;i<_ndimension;i++){
+ _Nprocessors*=_processors[i];
+ }
+
+ communicator_halo.resize (2*_ndimension);
+ for(int i=0;i<_ndimension*2;i++){
+ MPI_Comm_dup(communicator,&communicator_halo[i]);
+ }
+
+ int Size;
+ MPI_Comm_size(communicator,&Size);
+
+ assert(Size==_Nprocessors);
+}
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(float &f){
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+ int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+ int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{
+ int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+ assert(ierr==0);
+}
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+ int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+ assert(ierr==0);
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+ int rank;
+ int
ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
+ assert(ierr==0);
+ return rank;
+}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+ coor.resize(_ndimension);
+ int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
+ assert(ierr==0);
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+ int dest,
+ void *recv,
+ int from,
+ int bytes)
+{
+ std::vector<CommsRequest_t> reqs(0);
+ SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+ SendToRecvFromComplete(reqs);
+}
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+ void *recv,
+ int sender,
+ int receiver,
+ int bytes)
+{
+ MPI_Status stat;
+ assert(sender != receiver);
+ int tag = sender;
+ if ( _processor == sender ) {
+ MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+ }
+ if ( _processor == receiver ) {
+ MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
+ }
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+ void *xmit,
+ int dest,
+ void *recv,
+ int from,
+ int bytes)
+{
+ int myrank = _processor;
+ int ierr;
+ if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+ MPI_Request xrq;
+ MPI_Request rrq;
+
+ ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+ ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+
+ assert(ierr==0);
+ list.push_back(xrq);
+ list.push_back(rrq);
+ } else {
+ // Give the CPU to MPI immediately; can use threads to overlap optionally
+ ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+ recv,bytes,MPI_CHAR,from, from,
+ communicator,MPI_STATUS_IGNORE);
+ assert(ierr==0);
+ }
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+ if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+ int nreq=list.size();
+ std::vector<MPI_Status> status(nreq);
+ int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+ assert(ierr==0);
+ }
+}
+
+void CartesianCommunicator::Barrier(void)
+{
+ int ierr = MPI_Barrier(communicator);
+ assert(ierr==0);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+ int ierr=MPI_Bcast(data,
+ bytes,
+ MPI_BYTE,
+ root,
+ communicator);
+ assert(ierr==0);
+}
+ ///////////////////////////////////////////////////////
+ // Should only be used prior to Grid Init finished.
+ // Check for this?
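+ // (These world functions use communicator_world rather than the Cartesian
+ // communicator, so they remain valid before any CartesianCommunicator
+ // has been constructed.)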
+ /////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void){ + int r; + MPI_Comm_rank(communicator_world,&r); + return r; +} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) +{ + int ierr= MPI_Bcast(data, + bytes, + MPI_BYTE, + root, + communicator_world); + assert(ierr==0); +} + + double CartesianCommunicator::StencilSendToRecvFromBegin(int dir, + std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes) +{ + int myrank = _processor; + int ierr; + // Give the CPU to MPI immediately; can use threads to overlap optionally + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator_halo[dir],MPI_STATUS_IGNORE); + assert(ierr==0); + return 2.0*bytes; +} +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall){ }; + + + +} + From d2e8372df3c0a39b9eb2c000c7f190c670a75501 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 24 Jun 2017 23:03:39 +0100 Subject: [PATCH 091/170] SU(N) algebra fix (was not working) --- lib/qcd/utils/SUn.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 99a620bc..8f0c0a7b 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -716,8 +716,7 @@ template for (int a = 0; a < AdjointDimension; a++) { generator(a, Ta); - auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep - pokeColour(h_out, tmp, a); + pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a); } } From 0af740dc1521656ee549094fea038176791d6cac Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 24 Jun 2017 23:04:05 +0100 Subject: [PATCH 092/170] minor scalar HMC code improvement --- lib/qcd/action/scalar/ScalarImpl.h | 8 +++++--- lib/qcd/action/scalar/ScalarInteractionAction.h | 2 +- lib/qcd/hmc/HMC.h | 2 +- lib/qcd/hmc/HMCResourceManager.h | 3 ++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 5342a1fa..174553a2 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -93,6 +93,8 @@ class ScalarImplTypes { class ScalarAdjMatrixImplTypes { public: typedef S Simd; + typedef QCD::SU Group; + template using iImplField = iScalar>>; template @@ -108,7 +110,7 @@ class ScalarImplTypes { typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) { - QCD::SU::GaussianFundamentalLieAlgebraMatrix(pRNG, P); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P); } static inline Field projectForce(Field& P) {return P;} @@ -122,11 +124,11 @@ class ScalarImplTypes { } static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { - QCD::SU::LieRandomize(pRNG, U); + Group::LieRandomize(pRNG, U); } static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { - QCD::SU::LieRandomize(pRNG, U, 0.01); + Group::LieRandomize(pRNG, U, 0.01); } static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 5f4c630c..1ff8fd37 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -98,7 +98,7 @@ namespace Grid { permute(temp2, *temp, permute_type); action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2; } else { - action._odata[i] -= 
*temp*(*t_p) + (*t_p)*(*temp); + action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp); } } else { action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; diff --git a/lib/qcd/hmc/HMC.h b/lib/qcd/hmc/HMC.h index ac690b60..5688bb24 100644 --- a/lib/qcd/hmc/HMC.h +++ b/lib/qcd/hmc/HMC.h @@ -76,7 +76,7 @@ struct HMCparameters: Serializable { template < class ReaderClass > void initialize(Reader &TheReader){ - std::cout << "Reading HMC\n"; + std::cout << GridLogMessage << "Reading HMC\n"; read(TheReader, "HMC", *this); } diff --git a/lib/qcd/hmc/HMCResourceManager.h b/lib/qcd/hmc/HMCResourceManager.h index 9f4c99a9..cf0000ed 100644 --- a/lib/qcd/hmc/HMCResourceManager.h +++ b/lib/qcd/hmc/HMCResourceManager.h @@ -253,6 +253,7 @@ class HMCResourceManager { template void AddObservable(Types&&... Args){ ObservablesList.push_back(std::unique_ptr(new T(std::forward(Args)...))); + ObservablesList.back()->print_parameters(); } std::vector* > GetObservables(){ @@ -297,4 +298,4 @@ private: } } -#endif // HMC_RESOURCE_MANAGER_H \ No newline at end of file +#endif // HMC_RESOURCE_MANAGER_H From 54e94360ad06cde7edbaeede2cf18eb0d5a1227b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 24 Jun 2017 23:10:24 +0100 Subject: [PATCH 093/170] Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit --- benchmarks/Benchmark_comms.cc | 27 ++++++----- configure.ac | 10 ++--- lib/Makefile.am | 4 +- lib/communicator/Communicator_base.cc | 22 +++++---- lib/communicator/Communicator_base.h | 20 +++++---- lib/communicator/Communicator_mpi3.cc | 12 ++--- lib/communicator/Communicator_mpit.cc | 26 ++++++----- lib/cshift/Cshift.h | 2 +- lib/log/Log.cc | 2 +- lib/parallelIO/BinaryIO.h | 2 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 55 +++++++++++------------ lib/stencil/Stencil.h | 45 ++++++++++++++++--- lib/util/Init.cc | 2 +- 13 files changed, 139 insertions(+), 90 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 532532f8..753b8a58 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) int Nloop=100; int nmu=0; - int maxlat=24; + int maxlat=32; for(int mu=0;mu1) nmu++; std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; @@ -80,7 +80,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -163,7 +163,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat,lat,lat,lat}); @@ -249,7 +249,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -299,7 +299,7 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); + bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -310,11 +310,11 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); + bytes,mu+4); } } - Grid.StencilSendToRecvFromComplete(requests); + Grid.StencilSendToRecvFromComplete(requests,0); Grid.Barrier(); double stop=usecond(); t_time[i] = stop-start; // microseconds @@ -346,7 +346,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector 
latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -393,8 +393,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu); + Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); comm_proc = mpi_layout[mu]-1; @@ -406,8 +406,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu+4); + Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); } @@ -435,6 +435,9 @@ int main (int argc, char ** argv) } } + std::cout< &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes) + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) { + // Discard the "dir" SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); return 2.0*bytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } +#endif + +#if !defined( GRID_COMMS_MPI3) + void CartesianCommunicator::StencilBarrier(void){}; commVector CartesianCommunicator::ShmBufStorageVector; diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 12a8429f..4e471b43 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -38,7 +38,7 @@ Author: Peter Boyle #ifdef GRID_COMMS_MPI3 #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif #ifdef GRID_COMMS_SHMEM @@ -64,7 +64,7 @@ class CartesianCommunicator { std::vector _processor_coor; // linear processor coordinate unsigned long _ndimension; -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) static MPI_Comm communicator_world; MPI_Comm communicator; typedef MPI_Request CommsRequest_t; @@ -72,6 +72,10 @@ class CartesianCommunicator { typedef int CommsRequest_t; #endif +#if defined (GRID_COMMS_MPIT) + std::vector communicator_halo; +#endif + //////////////////////////////////////////////////////////////////// // Helper functionality for SHM Windows common to all other impls //////////////////////////////////////////////////////////////////// @@ -212,13 +216,13 @@ class CartesianCommunicator { void SendToRecvFromComplete(std::vector &waitall); double StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); - void StencilSendToRecvFromComplete(std::vector &waitall); + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); //////////////////////////////////////////////////////////// diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 632eb991..8046fef6 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -600,11 +600,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) { MPI_Request xrq; MPI_Request rrq; @@ 
-643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 07522900..24a518ec 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) assert(ierr==0); } - double CartesianCommunicator::StencilSendToRecvFromBegin(int dir, - std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes) +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir) { + int myrank = _processor; int ierr; + assert(dir < communicator_halo.size()); + + // std::cout << " sending on communicator "< &waitall){ }; +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) +{ + // Do nothing +}; diff --git a/lib/cshift/Cshift.h b/lib/cshift/Cshift.h index cd162e35..7d0caeee 100644 --- a/lib/cshift/Cshift.h +++ b/lib/cshift/Cshift.h @@ -42,7 +42,7 @@ Author: Peter Boyle #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif diff --git a/lib/log/Log.cc b/lib/log/Log.cc index 69a9a0a8..65dc2812 100644 --- a/lib/log/Log.cc +++ b/lib/log/Log.cc @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 117bec01..480afa01 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -29,7 +29,7 @@ #ifndef GRID_BINARY_IO_H #define GRID_BINARY_IO_H -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #define USE_MPI_IO #else #undef USE_MPI_IO diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 27319fb0..6a6bc1f8 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -379,7 +379,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg { #ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; Compressor compressor(dag); @@ -388,46 +387,46 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - std::vector > reqs; // Rely on async comms; start comms before merge of local data + DhopComputeTime-=usecond(); DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); - - DhopFaceTime-=usecond(); - st.CommsMergeSHM(compressor); - DhopFaceTime+=usecond(); - - // Perhaps use omp task and region #pragma omp parallel { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; + // Should time this somehow; 
hard as the threads fork nowait + st.CommunicateThreaded(); - GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); - int sF = LLs * myoff; - - if ( me == 0 ) { - st.CommunicateComplete(reqs); - DhopCommTime+=usecond(); - } else { - // Interior links in stencil - if ( me==1 ) DhopComputeTime-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - if ( me==1 ) DhopComputeTime+=usecond(); + if (dag == DaggerYes) { +#pragma omp for + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } else { +#pragma omp for + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); } } +#pragma omp single + DhopComputeTime+=usecond(); + +#pragma omp taskwait + +#pragma omp single + DhopCommTime+=usecond(); + } // Closes parallel region and waits the comms (I hope) + DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - // Load imbalance alert. Should use dynamic schedule OMP for loop - // Perhaps create a list of only those sites with face work, and - // load balance process the list. DhopComputeTime2-=usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); @@ -448,11 +447,9 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg #else assert(0); #endif - } - template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 2894778a..17db64d8 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -248,24 +248,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// // Comms packet queue for asynch thread ////////////////////////////////////////// + void CommunicateThreaded() + { + for(int i=0;i reqs; + bytes=_grid->StencilSendToRecvFromBegin(reqs, + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + _grid->StencilSendToRecvFromComplete(reqs,i); + // Last task logged; this is approximate but hard to catch + // the last to complete + stop = usecond(); + stop = stop - start; + + if ( i==0 ) commtime+=stop; + +#pragma omp critical + { + comms_bytes+=bytes; + } + + } + } + + } void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); } } void CommunicateComplete(std::vector > &reqs) { for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + _grid->StencilSendToRecvFromComplete(reqs[i],i); } commtime+=usecond(); } diff --git a/lib/util/Init.cc b/lib/util/Init.cc index fe3b1734..fc701ac1 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv) void Grid_finalize(void) { -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) MPI_Finalize(); Grid_unquiesce_nodes(); #endif From 7d7220cbd72278050a1cfda6a083a87b85fecbca 
Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 27 Jun 2017 14:38:45 +0100 Subject: [PATCH 094/170] scalar: lambda/4! convention --- lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++-- tests/hmc/Test_hmc_ScalarActionNxN.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 1ff8fd37..ac2d4fbb 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -81,7 +81,7 @@ namespace Grid { phiStencil.HaloExchange(p, compressor); Field action(p._grid), pshift(p._grid), phisquared(p._grid); phisquared = p*p; - action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared; + action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils parallel_for (int i = 0; i < p._grid->oSites(); i++) { @@ -113,7 +113,7 @@ namespace Grid { virtual void deriv(const Field &p, Field &force) { assert(p._grid->Nd() == Ndim); - force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p; + force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p; // move this outside static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index a7490f51..a4dad1a3 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -45,7 +45,7 @@ using namespace Grid; using namespace Grid::QCD; template -class MagLogger : public HmcObservable { +class MagMeas : public HmcObservable { public: typedef typename Impl::Field Field; typedef typename Impl::Simd::scalar_type Trace; @@ -72,13 +72,13 @@ private: }; template -class MagMod: public ObservableModule, NoParameters>{ - typedef ObservableModule, NoParameters> ObsBase; +class MagMod: public ObservableModule, NoParameters>{ + typedef ObservableModule, NoParameters> ObsBase; using ObsBase::ObsBase; // for constructors // acquire resource virtual void initialize(){ - this->ObservablePtr.reset(new MagLogger()); + this->ObservablePtr.reset(new MagMeas()); } public: MagMod(): ObsBase(NoParameters()){} From 15e87a460725f07dd380bd21b538b43b687a0551 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 27 Jun 2017 14:39:27 +0100 Subject: [PATCH 095/170] HDF5 IO fix --- lib/serialisation/Hdf5IO.cc | 4 +++- lib/serialisation/Hdf5IO.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/serialisation/Hdf5IO.cc b/lib/serialisation/Hdf5IO.cc index b9bb0b87..1fb7be0c 100644 --- a/lib/serialisation/Hdf5IO.cc +++ b/lib/serialisation/Hdf5IO.cc @@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName) Hdf5Type::type()); } -void Hdf5Reader::push(const std::string &s) +bool Hdf5Reader::push(const std::string &s) { group_ = group_.openGroup(s); path_.push_back(s); + + return true; } void Hdf5Reader::pop(void) diff --git a/lib/serialisation/Hdf5IO.h b/lib/serialisation/Hdf5IO.h index 2f891cd4..94ad9736 100644 --- a/lib/serialisation/Hdf5IO.h +++ b/lib/serialisation/Hdf5IO.h @@ -54,7 +54,7 @@ namespace Grid public: Hdf5Reader(const std::string &fileName); virtual ~Hdf5Reader(void) = default; - void push(const std::string &s); + bool push(const std::string &s); void pop(void); template void readDefault(const std::string &s, U &output); From 
bf729766ddd36c9ebe8c6be35d7527353aff2963 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:32:24 +0200 Subject: [PATCH 096/170] removed collision with QPX implementation --- lib/simd/Grid_generic_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index eac65e09..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define GEN_SIMD_WIDTH 16 + static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS From e43a8b6b8ad52b88ed8e9ae1c140dc34a3da18d4 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:58:48 +0200 Subject: [PATCH 097/170] removed comments --- Grid_vector_types.h | 857 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 857 insertions(+) create mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h new file mode 100644 index 00000000..e05fecc4 --- /dev/null +++ b/Grid_vector_types.h @@ -0,0 +1,857 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/simd/Grid_vector_type.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Guido Cossu +Author: Peter Boyle +Author: neo + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//--------------------------------------------------------------------------- +/*! 
@file Grid_vector_types.h + @brief Defines templated class Grid_simd to deal with inner vector types +*/ +// Time-stamp: <2015-07-10 17:45:33 neo> +//--------------------------------------------------------------------------- +#ifndef GRID_VECTOR_TYPES +#define GRID_VECTOR_TYPES + +#ifdef GEN +#include "Grid_generic.h" +#endif +#ifdef SSE4 +#include "Grid_sse4.h" +#endif +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) +#include "Grid_avx.h" +#endif +#if defined AVX512 +#include "Grid_avx512.h" +#endif +#if defined IMCI +#include "Grid_imci.h" +#endif +#ifdef NEONv8 +#include "Grid_neon.h" +#endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" + +namespace Grid { + +////////////////////////////////////// +// To take the floating point type of real/complex type +////////////////////////////////////// +template +struct RealPart { + typedef T type; +}; +template +struct RealPart > { + typedef T type; +}; + +#include + +////////////////////////////////////// +// demote a vector to real type +////////////////////////////////////// +// type alias used to simplify the syntax of std::enable_if +template using Invoke = typename T::type; +template using EnableIf = Invoke >; +template using NotEnableIf = Invoke >; + +//////////////////////////////////////////////////////// +// Check for complexity with type traits +template struct is_complex : public std::false_type {}; +template <> struct is_complex > : public std::true_type {}; +template <> struct is_complex > : public std::true_type {}; + +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; + +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; + +//////////////////////////////////////////////////////// +// Define the operation templates functors +// general forms to allow for vsplat syntax +// need explicit declaration of types when used since +// clang cannot automatically determine the output type sometimes +template +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + +template +Out binary(Input1 src_1, Input2 src_2, Operation op) { + return op(src_1, src_2); +} + +template +Out unary(Input src, Operation op) { + return op(src); +} +/////////////////////////////////////////////// + +/* + @brief Grid_simd class for the SIMD vector type operations + */ +template +class Grid_simd { + public: + typedef typename RealPart::type Real; + typedef Vector_type vector_type; + typedef Scalar_type scalar_type; + + typedef union conv_t_union { + Vector_type v; + Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; + conv_t_union(){}; + } conv_t; + + Vector_type v; + + static inline constexpr int Nsimd(void) { + return sizeof(Vector_type) / sizeof(Scalar_type); + } + + Grid_simd &operator=(const Grid_simd &&rhs) { + v = rhs.v; + return *this; + }; + Grid_simd &operator=(const Grid_simd &rhs) { + v = rhs.v; + return *this; + }; // faster than not declaring it and leaving to the compiler + Grid_simd() = default; + Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + ///////////////////////////// + // Constructors + ///////////////////////////// + Grid_simd &operator=(Zero &z) { + 
vzero(*this); + return (*this); + } + + // Enable if complex type + template + Grid_simd(const typename std::enable_if::value, S>::type a) { + vsplat(*this, a); + }; + + Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; + + /////////////////////////////////////////////// + // mac, mult, sub, add, adj + /////////////////////////////////////////////// + + // FIXME -- alias this to an inline MAC struct. + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + friend inline void mac(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Scalar_type *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) + (*r); + } + + //////////////////////////////////////////////////////////////////////// + // FIXME: gonna remove these load/store, get, set, prefetch + //////////////////////////////////////////////////////////////////////// + friend inline void vset(Grid_simd &ret, Scalar_type *a) { + ret.v = unary(a, VsetSIMD()); + } + + /////////////////////// + // Vstore + /////////////////////// + friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { + binary(ret.v, (Real *)a, VstoreSIMD()); + } + + /////////////////////// + // Vprefetch + /////////////////////// + friend inline void vprefetch(const Grid_simd &v) { + prefetch_HINT_T0((const char *)&v.v); + } + + /////////////////////// + // Reduce + /////////////////////// + friend inline Scalar_type Reduce(const Grid_simd &in) { + return unary(in.v, ReduceSIMD()); + } + + //////////////////////////// + // operator scalar * simd + //////////////////////////// + friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + vsplat(va, a); + return va * b; + } + friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { + return a * b; + } + + ////////////////////////////////// + // Divides + ////////////////////////////////// + friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + 
vsplat(va, a); + return va / b; + } + friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { + Grid_simd va; + vsplat(va, a); + return b / a; + } + + /////////////////////// + // Unary negation + /////////////////////// + friend inline Grid_simd operator-(const Grid_simd &r) { + Grid_simd ret; + vzero(ret); + ret = ret - r; + return ret; + } + // *=,+=,-= operators + inline Grid_simd &operator*=(const Grid_simd &r) { + *this = (*this) * r; + return *this; + // return (*this)*r; ? + } + inline Grid_simd &operator+=(const Grid_simd &r) { + *this = *this + r; + return *this; + } + inline Grid_simd &operator-=(const Grid_simd &r) { + *this = *this - r; + return *this; + } + + /////////////////////////////////////// + // Not all functions are supported + // through SIMD and must breakout to + // scalar type and back again. This + // provides support + /////////////////////////////////////// + + template + friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { + Grid_simd ret; + Grid_simd::conv_t conv; + Grid_simd::scalar_type s; + + conv.v = v.v; + for (int i = 0; i < Nsimd(); i++) { + s = conv.s[i]; + conv.s[i] = func(s); + } + ret.v = conv.v; + return ret; + } + template + friend inline Grid_simd SimdApplyBinop(const functor &func, + const Grid_simd &x, + const Grid_simd &y) { + Grid_simd ret; + Grid_simd::conv_t cx; + Grid_simd::conv_t cy; + Grid_simd::scalar_type sx,sy; + + cx.v = x.v; + cy.v = y.v; + for (int i = 0; i < Nsimd(); i++) { + sx = cx.s[i]; + sy = cy.s[i]; + cx.s[i] = func(sx,sy); + } + ret.v = cx.v; + return ret; + } + /////////////////////// + // Exchange + // Al Ah , Bl Bh -> Al Bl Ah,Bh + /////////////////////// + friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) + { + if (n==3) { + Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); + } else if(n==2) { + Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); + } else if(n==1) { + Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); + } else if(n==0) { + Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); + } + } + + //////////////////////////////////////////////////////////////////// + // General permute; assumes vector length is same across + // all subtypes; may not be a good assumption, but could + // add the vector width as a template param for BG/Q for example + //////////////////////////////////////////////////////////////////// + friend inline void permute0(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute0(b.v); + } + friend inline void permute1(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute1(b.v); + } + friend inline void permute2(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute2(b.v); + } + friend inline void permute3(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute3(b.v); + } + friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { + if (perm & RotateBit) { + int dist = perm & 0xF; + y = rotate(b, dist); + return; + } + else if(perm==3) permute3(y, b); + else if(perm==2) permute2(y, b); + else if(perm==1) permute1(y, b); + else if(perm==0) permute0(y, b); + } + + /////////////////////////////// + // Getting single lanes + /////////////////////////////// + inline Scalar_type getlane(int lane) { + return ((Scalar_type*)&v)[lane]; + } + + inline void putlane(const Scalar_type &S, int lane){ + ((Scalar_type*)&v)[lane] = S; + } + + + +}; // end of Grid_simd class definition + +inline void permute(ComplexD 
&y,ComplexD b, int perm) { y=b; } +inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } +inline void permute(RealD &y,RealD b, int perm) { y=b; } +inline void permute(RealF &y,RealF b, int perm) { y=b; } + +//////////////////////////////////////////////////////////////////// +// General rotate +//////////////////////////////////////////////////////////////////// +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, nrot); + return ret; +} +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); + return ret; +} +template =0> +inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,nrot); +} +template =0> +inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,2*nrot); +} + +template +inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + vsplat(ret,typepun[lane]); +} +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} + + + +/////////////////////// +// Splat +/////////////////////// + +// this is only for the complex version +template = 0, class ABtype> +inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { + ret.v = binary(a, b, VsplatSIMD()); +} + +// overload if complex +template +inline void vsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), imag(c)); +} +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} + +// if real fill with a, if complex fill with a in the real part (first function +// above) +template +inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { + ret.v = unary(a, VsplatSIMD()); +} +////////////////////////// + +/////////////////////////////////////////////// +// Initialise to 1,0,i for the correct types +/////////////////////////////////////////////// +// For complex types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0, 0.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0, 0.0)); +} // use xor? 
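+// Illustrative sketch only (not part of this file): the "use xor?" idea
+// above would zero the lanes without loading a constant, e.g. with
+// single-precision AVX:
+//   ret.v = _mm256_xor_ps(ret.v, ret.v);  // x ^ x -> all-zero bits -> +0.0f
+// The splat form is kept because it is portable across all Grid_* backends.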
+template = 0> +inline void vcomplex_i(Grid_simd &ret) { + vsplat(ret, S(0.0, 1.0)); +} + +template = 0> +inline void visign(Grid_simd &ret) { + vsplat(ret, S(1.0, -1.0)); +} +template = 0> +inline void vrsign(Grid_simd &ret) { + vsplat(ret, S(-1.0, 1.0)); +} + +// if not complex overload here +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0)); +} + +// For integral types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, 1); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, 0); +} +template = 0> +inline void vtrue(Grid_simd &ret) { + vsplat(ret, 0xFFFFFFFF); +} +template = 0> +inline void vfalse(Grid_simd &ret) { + vsplat(ret, 0); +} +template +inline void zeroit(Grid_simd &z) { + vzero(z); +} + +/////////////////////// +// Vstream +/////////////////////// +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + binary((S *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + typedef typename S::value_type T; + binary((T *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + out = in; +} + +//////////////////////////////////// +// Arithmetic operator overloads +,-,* +//////////////////////////////////// +template +inline Grid_simd operator+(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SumSIMD()); + return ret; +}; + +template +inline Grid_simd operator-(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SubSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + typedef Grid_simd simd; + + simd ret; + simd den; + typename simd::conv_t conv; + + ret = a * conjugate(b) ; + den = b * conjugate(b) ; + + + auto real_den = toReal(den); + + ret.v=binary(ret.v, real_den.v, DivSIMD()); + + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, DivSIMD()); + return ret; +}; + +/////////////////////// +// Conjugate +/////////////////////// +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + Grid_simd ret; + ret.v = unary(in.v, ConjSIMD()); + return ret; +} +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + return in; // for real objects +} +// Suppress adj for integer types... // odd; why conjugate above but not adj?? 
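+// Dispatch summary (comment only, for clarity):
+//   conjugate(vComplex*) -> lane-wise complex conjugate via ConjSIMD
+//   conjugate(vReal*)    -> identity (real data, nothing to flip)
+//   adj(...)             -> forwards to conjugate, but the IfNotInteger
+//                           constraint below removes it for vInteger.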
+template = 0> +inline Grid_simd adj(const Grid_simd &in) { + return conjugate(in); +} + +/////////////////////// +// timesMinusI +/////////////////////// +template = 0> +inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesMinusISIMD()); +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + Grid_simd ret; + timesMinusI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + return in; +} + +/////////////////////// +// timesI +/////////////////////// +template = 0> +inline void timesI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesISIMD()); +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + Grid_simd ret; + timesI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + return in; +} + +///////////////////// +// Inner, outer +///////////////////// + +template +inline Grid_simd innerProduct(const Grid_simd &l, + const Grid_simd &r) { + return conjugate(l) * r; +} +template +inline Grid_simd outerProduct(const Grid_simd &l, + const Grid_simd &r) { + return l * conjugate(r); +} + +template +inline Grid_simd trace(const Grid_simd &arg) { + return arg; +} + +//////////////////////////////////////////////////////////// +// copy/splat complex real parts into real; +// insert real into complex and zero imag; +//////////////////////////////////////////////////////////// + +// real = toReal( complex ) +template = 0> +inline Grid_simd toReal(const Grid_simd, V> &in) { + typedef Grid_simd simd; + simd ret; + typename simd::conv_t conv; + conv.v = in.v; // copy the vector content (bytewise) + for (int i = 0; i < simd::Nsimd(); i += 2) { + conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... 
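+    // Worked example (illustrative): complex input lanes (re0,im0),(re1,im1)
+    // are addressed here as scalars re0,im0,re1,im1; after the loop the
+    // buffer reads re0,re0,re1,re1, i.e. each real part duplicated over the
+    // adjacent imaginary slot, matching the (r,r);(r,r) layout noted above.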
+ } + ret.v = conv.v; + return ret; +} + +// complex = toComplex( real ) +template = 0> // must be a real arg +inline Grid_simd, V> toComplex(const Grid_simd &in) { + typedef Grid_simd Rsimd; + typedef Grid_simd, V> Csimd; + typename Rsimd::conv_t conv; // address as real + + conv.v = in.v; + for (int i = 0; i < Rsimd::Nsimd(); i += 2) { + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated + // indicating the SIMD grids of real and imag assignment did not correctly + // match + conv.s[i + 1] = 0.0; // zero imaginary parts + } + Csimd ret; + ret.v = conv.v; + return ret; +} + +/////////////////////////////// +// Define available types +/////////////////////////////// +typedef Grid_simd vRealF; +typedef Grid_simd vRealD; +typedef Grid_simd, SIMD_Ftype> vComplexF; +typedef Grid_simd, SIMD_Dtype> vComplexD; +typedef Grid_simd vInteger; + +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2 +struct is_simd : public std::false_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; + +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; +} + +#endif From a9c816a26883056c367ea5972bca41fc1b24b3a3 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 21:39:15 +0200 Subject: [PATCH 098/170] moved file to correct folder --- Grid_vector_types.h | 857 ----------------------------------- lib/simd/Grid_vector_types.h | 75 +-- 2 files changed, 22 insertions(+), 910 deletions(-) delete mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h deleted file mode 100644 index e05fecc4..00000000 --- a/Grid_vector_types.h +++ /dev/null @@ -1,857 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/simd/Grid_vector_type.h - -Copyright (C) 2015 - -Author: Azusa Yamaguchi -Author: Guido Cossu -Author: Peter Boyle -Author: neo - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -//--------------------------------------------------------------------------- -/*! 
@file Grid_vector_types.h - @brief Defines templated class Grid_simd to deal with inner vector types -*/ -// Time-stamp: <2015-07-10 17:45:33 neo> -//--------------------------------------------------------------------------- -#ifndef GRID_VECTOR_TYPES -#define GRID_VECTOR_TYPES - -#ifdef GEN -#include "Grid_generic.h" -#endif -#ifdef SSE4 -#include "Grid_sse4.h" -#endif -#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) -#include "Grid_avx.h" -#endif -#if defined AVX512 -#include "Grid_avx512.h" -#endif -#if defined IMCI -#include "Grid_imci.h" -#endif -#ifdef NEONv8 -#include "Grid_neon.h" -#endif -#if defined QPX -#include "Grid_qpx.h" -#endif - -#include "l1p.h" - -namespace Grid { - -////////////////////////////////////// -// To take the floating point type of real/complex type -////////////////////////////////////// -template -struct RealPart { - typedef T type; -}; -template -struct RealPart > { - typedef T type; -}; - -#include - -////////////////////////////////////// -// demote a vector to real type -////////////////////////////////////// -// type alias used to simplify the syntax of std::enable_if -template using Invoke = typename T::type; -template using EnableIf = Invoke >; -template using NotEnableIf = Invoke >; - -//////////////////////////////////////////////////////// -// Check for complexity with type traits -template struct is_complex : public std::false_type {}; -template <> struct is_complex > : public std::true_type {}; -template <> struct is_complex > : public std::true_type {}; - -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; -template using IfSame = Invoke::value, int> >; - -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; -template using IfNotSame = Invoke::value, int> >; - -//////////////////////////////////////////////////////// -// Define the operation templates functors -// general forms to allow for vsplat syntax -// need explicit declaration of types when used since -// clang cannot automatically determine the output type sometimes -template -Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { - return op(src_1, src_2, src_3); -} - -template -Out binary(Input1 src_1, Input2 src_2, Operation op) { - return op(src_1, src_2); -} - -template -Out unary(Input src, Operation op) { - return op(src); -} -/////////////////////////////////////////////// - -/* - @brief Grid_simd class for the SIMD vector type operations - */ -template -class Grid_simd { - public: - typedef typename RealPart::type Real; - typedef Vector_type vector_type; - typedef Scalar_type scalar_type; - - typedef union conv_t_union { - Vector_type v; - Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; - conv_t_union(){}; - } conv_t; - - Vector_type v; - - static inline constexpr int Nsimd(void) { - return sizeof(Vector_type) / sizeof(Scalar_type); - } - - Grid_simd &operator=(const Grid_simd &&rhs) { - v = rhs.v; - return *this; - }; - Grid_simd &operator=(const Grid_simd &rhs) { - v = rhs.v; - return *this; - }; // faster than not declaring it and leaving to the compiler - Grid_simd() = default; - Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; - - ///////////////////////////// - // Constructors - ///////////////////////////// - Grid_simd &operator=(Zero &z) { - 
vzero(*this); - return (*this); - } - - // Enable if complex type - template - Grid_simd(const typename std::enable_if::value, S>::type a) { - vsplat(*this, a); - }; - - Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; - - /////////////////////////////////////////////// - // mac, mult, sub, add, adj - /////////////////////////////////////////////// - - // FIXME -- alias this to an inline MAC struct. - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - friend inline void mac(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Scalar_type *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) + (*r); - } - - //////////////////////////////////////////////////////////////////////// - // FIXME: gonna remove these load/store, get, set, prefetch - //////////////////////////////////////////////////////////////////////// - friend inline void vset(Grid_simd &ret, Scalar_type *a) { - ret.v = unary(a, VsetSIMD()); - } - - /////////////////////// - // Vstore - /////////////////////// - friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { - binary(ret.v, (Real *)a, VstoreSIMD()); - } - - /////////////////////// - // Vprefetch - /////////////////////// - friend inline void vprefetch(const Grid_simd &v) { - prefetch_HINT_T0((const char *)&v.v); - } - - /////////////////////// - // Reduce - /////////////////////// - friend inline Scalar_type Reduce(const Grid_simd &in) { - return unary(in.v, ReduceSIMD()); - } - - //////////////////////////// - // operator scalar * simd - //////////////////////////// - friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - vsplat(va, a); - return va * b; - } - friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { - return a * b; - } - - ////////////////////////////////// - // Divides - ////////////////////////////////// - friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - 
vsplat(va, a); - return va / b; - } - friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { - Grid_simd va; - vsplat(va, a); - return b / a; - } - - /////////////////////// - // Unary negation - /////////////////////// - friend inline Grid_simd operator-(const Grid_simd &r) { - Grid_simd ret; - vzero(ret); - ret = ret - r; - return ret; - } - // *=,+=,-= operators - inline Grid_simd &operator*=(const Grid_simd &r) { - *this = (*this) * r; - return *this; - // return (*this)*r; ? - } - inline Grid_simd &operator+=(const Grid_simd &r) { - *this = *this + r; - return *this; - } - inline Grid_simd &operator-=(const Grid_simd &r) { - *this = *this - r; - return *this; - } - - /////////////////////////////////////// - // Not all functions are supported - // through SIMD and must breakout to - // scalar type and back again. This - // provides support - /////////////////////////////////////// - - template - friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { - Grid_simd ret; - Grid_simd::conv_t conv; - Grid_simd::scalar_type s; - - conv.v = v.v; - for (int i = 0; i < Nsimd(); i++) { - s = conv.s[i]; - conv.s[i] = func(s); - } - ret.v = conv.v; - return ret; - } - template - friend inline Grid_simd SimdApplyBinop(const functor &func, - const Grid_simd &x, - const Grid_simd &y) { - Grid_simd ret; - Grid_simd::conv_t cx; - Grid_simd::conv_t cy; - Grid_simd::scalar_type sx,sy; - - cx.v = x.v; - cy.v = y.v; - for (int i = 0; i < Nsimd(); i++) { - sx = cx.s[i]; - sy = cy.s[i]; - cx.s[i] = func(sx,sy); - } - ret.v = cx.v; - return ret; - } - /////////////////////// - // Exchange - // Al Ah , Bl Bh -> Al Bl Ah,Bh - /////////////////////// - friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) - { - if (n==3) { - Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - } else if(n==2) { - Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); - } else if(n==1) { - Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { - Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); - } - } - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - friend inline void permute0(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute0(b.v); - } - friend inline void permute1(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute1(b.v); - } - friend inline void permute2(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute2(b.v); - } - friend inline void permute3(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute3(b.v); - } - friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { - if (perm & RotateBit) { - int dist = perm & 0xF; - y = rotate(b, dist); - return; - } - else if(perm==3) permute3(y, b); - else if(perm==2) permute2(y, b); - else if(perm==1) permute1(y, b); - else if(perm==0) permute0(y, b); - } - - /////////////////////////////// - // Getting single lanes - /////////////////////////////// - inline Scalar_type getlane(int lane) { - return ((Scalar_type*)&v)[lane]; - } - - inline void putlane(const Scalar_type &S, int lane){ - ((Scalar_type*)&v)[lane] = S; - } - - - -}; // end of Grid_simd class definition - -inline void permute(ComplexD 
&y,ComplexD b, int perm) { y=b; } -inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } -inline void permute(RealD &y,RealD b, int perm) { y=b; } -inline void permute(RealF &y,RealF b, int perm) { y=b; } - -//////////////////////////////////////////////////////////////////// -// General rotate -//////////////////////////////////////////////////////////////////// -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, nrot); - return ret; -} -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); - return ret; -} -template =0> -inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,nrot); -} -template =0> -inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,2*nrot); -} - -template -inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - vsplat(ret,typepun[lane]); -} -template =0> -inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} - - - -/////////////////////// -// Splat -/////////////////////// - -// this is only for the complex version -template = 0, class ABtype> -inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { - ret.v = binary(a, b, VsplatSIMD()); -} - -// overload if complex -template -inline void vsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), imag(c)); -} -template -inline void rsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), real(c)); -} - -// if real fill with a, if complex fill with a in the real part (first function -// above) -template -inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { - ret.v = unary(a, VsplatSIMD()); -} -////////////////////////// - -/////////////////////////////////////////////// -// Initialise to 1,0,i for the correct types -/////////////////////////////////////////////// -// For complex types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0, 0.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0, 0.0)); -} // use xor? 
-template = 0> -inline void vcomplex_i(Grid_simd &ret) { - vsplat(ret, S(0.0, 1.0)); -} - -template = 0> -inline void visign(Grid_simd &ret) { - vsplat(ret, S(1.0, -1.0)); -} -template = 0> -inline void vrsign(Grid_simd &ret) { - vsplat(ret, S(-1.0, 1.0)); -} - -// if not complex overload here -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0)); -} - -// For integral types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, 1); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, 0); -} -template = 0> -inline void vtrue(Grid_simd &ret) { - vsplat(ret, 0xFFFFFFFF); -} -template = 0> -inline void vfalse(Grid_simd &ret) { - vsplat(ret, 0); -} -template -inline void zeroit(Grid_simd &z) { - vzero(z); -} - -/////////////////////// -// Vstream -/////////////////////// -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - binary((S *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - typedef typename S::value_type T; - binary((T *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - out = in; -} - -//////////////////////////////////// -// Arithmetic operator overloads +,-,* -//////////////////////////////////// -template -inline Grid_simd operator+(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SumSIMD()); - return ret; -}; - -template -inline Grid_simd operator-(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SubSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; -template = 0> -inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { - Grid_simd ret; - ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); - return ret; -}; - - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultComplexSIMD()); - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - typedef Grid_simd simd; - - simd ret; - simd den; - typename simd::conv_t conv; - - ret = a * conjugate(b) ; - den = b * conjugate(b) ; - - - auto real_den = toReal(den); - - ret.v=binary(ret.v, real_den.v, DivSIMD()); - - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, DivSIMD()); - return ret; -}; - -/////////////////////// -// Conjugate -/////////////////////// -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - Grid_simd ret; - ret.v = unary(in.v, ConjSIMD()); - return ret; -} -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - return in; // for real objects -} -// Suppress adj for integer types... // odd; why conjugate above but not adj?? 
-template = 0> -inline Grid_simd adj(const Grid_simd &in) { - return conjugate(in); -} - -/////////////////////// -// timesMinusI -/////////////////////// -template = 0> -inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesMinusISIMD()); -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - Grid_simd ret; - timesMinusI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - return in; -} - -/////////////////////// -// timesI -/////////////////////// -template = 0> -inline void timesI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesISIMD()); -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - Grid_simd ret; - timesI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - return in; -} - -///////////////////// -// Inner, outer -///////////////////// - -template -inline Grid_simd innerProduct(const Grid_simd &l, - const Grid_simd &r) { - return conjugate(l) * r; -} -template -inline Grid_simd outerProduct(const Grid_simd &l, - const Grid_simd &r) { - return l * conjugate(r); -} - -template -inline Grid_simd trace(const Grid_simd &arg) { - return arg; -} - -//////////////////////////////////////////////////////////// -// copy/splat complex real parts into real; -// insert real into complex and zero imag; -//////////////////////////////////////////////////////////// - -// real = toReal( complex ) -template = 0> -inline Grid_simd toReal(const Grid_simd, V> &in) { - typedef Grid_simd simd; - simd ret; - typename simd::conv_t conv; - conv.v = in.v; // copy the vector content (bytewise) - for (int i = 0; i < simd::Nsimd(); i += 2) { - conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... 
- } - ret.v = conv.v; - return ret; -} - -// complex = toComplex( real ) -template = 0> // must be a real arg -inline Grid_simd, V> toComplex(const Grid_simd &in) { - typedef Grid_simd Rsimd; - typedef Grid_simd, V> Csimd; - typename Rsimd::conv_t conv; // address as real - - conv.v = in.v; - for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); - // trap any cases where real was not duplicated - // indicating the SIMD grids of real and imag assignment did not correctly - // match - conv.s[i + 1] = 0.0; // zero imaginary parts - } - Csimd ret; - ret.v = conv.v; - return ret; -} - -/////////////////////////////// -// Define available types -/////////////////////////////// -typedef Grid_simd vRealF; -typedef Grid_simd vRealD; -typedef Grid_simd, SIMD_Ftype> vComplexF; -typedef Grid_simd, SIMD_Dtype> vComplexD; -typedef Grid_simd vInteger; - -// Half precision; no arithmetic support -typedef Grid_simd vRealH; -typedef Grid_simd, SIMD_Htype> vComplexH; - -inline void precisionChange(vRealF *out,vRealD *in,int nvec) -{ - assert((nvec&0x1)==0); - for(int m=0;m*2 -struct is_simd : public std::false_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; - -template using IfSimd = Invoke::value, int> >; -template using IfNotSimd = Invoke::value, unsigned> >; -} - -#endif diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 424b5573..e05fecc4 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -327,16 +327,12 @@ class Grid_simd { // provides support /////////////////////////////////////// - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") - //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -364,11 +360,8 @@ class Grid_simd { ret.v = cx.v; return ret; } - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC pop_options - //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +372,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +399,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,10 +418,9 @@ class Grid_simd { } - + }; // end of Grid_simd class definition - inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } inline void permute(RealD &y,RealD b, int perm) { y=b; } @@ -451,29 +443,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = 
nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,27 +596,13 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_mult(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; - template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_madd(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; + // Distinguish between complex types and others template = 0> @@ -654,7 +632,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); @@ -773,8 +751,8 @@ inline Grid_simd, V> toComplex(const Grid_simd &in) { conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == - conv.s[i]); // trap any cases where real was not duplicated + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match conv.s[i + 1] = 0.0; // zero imaginary parts @@ -852,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);} inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);} - - // Check our vector types are of an appropriate size. 
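 // Illustrative reading of the assertions below (comment only): with AVX both
 // SIMD_Ftype (__m256, 8 floats) and SIMD_Dtype (__m256d, 4 doubles) occupy
 // 32 bytes, so the equal-size checks hold, while the QPX branch instead
 // expects the single-precision container to be half the double-precision one.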
#if defined QPX static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); @@ -868,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc ///////////////////////////////////////// template struct is_simd : public std::false_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; -template -using IfSimd = Invoke::value, int> >; -template -using IfNotSimd = Invoke::value, unsigned> >; +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; } #endif From 07de925127e15fe7b43e31a9e9f3f2298f5f4261 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 28 Jun 2017 12:45:44 +0100 Subject: [PATCH 099/170] minor scalar action fixes --- lib/qcd/action/scalar/ScalarImpl.h | 4 ++-- lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 174553a2..f85ab840 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -124,11 +124,11 @@ class ScalarImplTypes { } static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { - Group::LieRandomize(pRNG, U); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { - Group::LieRandomize(pRNG, U, 0.01); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01); } static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index ac2d4fbb..4d189352 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -81,7 +81,7 @@ namespace Grid { phiStencil.HaloExchange(p, compressor); Field action(p._grid), pshift(p._grid), phisquared(p._grid); phisquared = p*p; - action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared; + action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils parallel_for (int i = 0; i < p._grid->oSites(); i++) { @@ -113,7 +113,7 @@ namespace Grid { virtual void deriv(const Field &p, Field &force) { assert(p._grid->Nd() == Ndim); - force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p; + force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p; // move this outside static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); From 08e04b96761a03c703899a7ee6ca3f42dddcf2d2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 28 Jun 2017 15:30:06 +0100 Subject: [PATCH 100/170] Better benchmarks --- benchmarks/Benchmark_memory_bandwidth.cc | 44 ++++++++++---------- benchmarks/Benchmark_su3.cc | 52 ++++++++++++------------ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git 
a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index 1aa088f8..1136dfe0 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -65,11 +65,11 @@ int main (int argc, char ** argv) uint64_t Nloop=NLOOP; - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); double a=2.0; @@ -94,17 +94,17 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); double a=2.0; uint64_t Nloop=NLOOP; @@ -129,7 +129,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); @@ -138,11 +138,11 @@ int main (int argc, char ** argv) GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); RealD a=2.0; @@ -166,17 +166,17 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); RealD a=2.0; Real nn; double start=usecond(); diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 3d7f9bc9..035af2d9 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -37,12 +37,12 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); #define LMAX (64) - int Nloop=20; + int64_t Nloop=20; std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - int threads = GridThread::GetThreads(); + int64_t threads = 
GridThread::GetThreads(); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid);// random(pRNG,z); - LatticeColourMatrix x(&Grid);// random(pRNG,x); - LatticeColourMatrix y(&Grid);// random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i Date: Wed, 28 Jun 2017 20:22:22 +0200 Subject: [PATCH 101/170] corrected Grid_neon.h --- lib/simd/Grid_neon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index f3f802e7..38815389 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -41,6 +41,10 @@ Author: neo //#ifndef ARM_NEON //#define ARM_NEON +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 16u 
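+// ARMv8 NEON is 128 bits wide: fall back to a 16 byte generic SIMD width when the build has not defined one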
+#endif + #include "Grid_generic_types.h" #include From 6f5a5cd9b3269932a720804aebe8b7046d4b68fe Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 28 Jun 2017 23:27:02 +0100 Subject: [PATCH 102/170] Improved threaded comms benchmark --- TODO | 11 ++-- benchmarks/Benchmark_comms.cc | 94 +++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/TODO b/TODO index 001c6c0c..3d29215e 100644 --- a/TODO +++ b/TODO @@ -2,10 +2,13 @@ TODO: --------------- Large item work list: -1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O +1)- I/O; There appear to be issues with MPI IO and NERSC with large files. + Possible 2GB limit reappeared. GPFS driver in Intel MPI. + +2)- BG/Q port and check + +3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features -2)- Christoph's local basis expansion Lanczos -3)- BG/Q port and check 4)- Precision conversion and sort out localConvert <-- partial - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet 5)- Physical propagator interface @@ -14,6 +17,8 @@ Large item work list: 8)- HDCR resume Recent DONE + +-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE -- GaugeFix into central location <-- DONE -- Scidac and Ildg metadata handling <-- DONE diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 753b8a58..698f9d25 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -435,6 +435,100 @@ int main (int argc, char ** argv) } } + + + + std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + double dbytes; + for(int i=0;i requests; + dbytes=0; + ncomm=0; + + parallel_for(int dir=0;dir<8;dir++){ + + double tbytes; + int mu =dir % 4; + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int xmit_to_rank; + int recv_from_rank; + if ( dir == mu ) { + int comm_proc=1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } else { + int comm_proc = mpi_layout[mu]-1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } + tbytes= Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[dir][0], + xmit_to_rank, + (void *)&rbuf[dir][0], + recv_from_rank, + bytes,dir); + Grid.StencilSendToRecvFromComplete(requests,dir); + requests.resize(0); + +#pragma omp atomic + dbytes+=tbytes; + } + } + Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } + + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; + + + std::cout< Date: Thu, 29 Jun 2017 11:30:29 +0100 Subject: [PATCH 103/170] Small corrections to the NEON port --- configure.ac | 2 +- 
lib/qcd/smearing/WilsonFlow.h | 9 ++++----- lib/simd/Grid_neon.h | 15 +++++---------- lib/simd/Grid_vector_types.h | 2 +- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/configure.ac b/configure.ac index a69b97e3..75cf7891 100644 --- a/configure.ac +++ b/configure.ac @@ -250,7 +250,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; NEONv8) AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) - SIMD_FLAGS='';; + SIMD_FLAGS='-march=armv8-a';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 5e9f2d95..4f5c0d43 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -108,7 +108,7 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real if (maxTau - taus < epsilon){ epsilon = maxTau-taus; } - std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; GaugeField Z(U._grid); GaugeField Zprime(U._grid); GaugeField tmp(U._grid), Uprime(U._grid); @@ -138,10 +138,10 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real // adjust integration step taus += epsilon; - std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; + //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); - std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } @@ -166,7 +166,6 @@ void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { out = in; for (unsigned int step = 1; step <= Nstep; step++) { auto start = std::chrono::high_resolution_clock::now(); - std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; evolve_step(out); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; @@ -191,7 +190,7 @@ void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, Re unsigned int step = 0; do{ step++; - std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; + //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; evolve_step_adaptive(out, maxTau); std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 38815389..d6eb9c5a 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -6,9 +6,9 @@ Copyright (C) 2015 -Author: Nils Meyer -Author: Peter Boyle -Author: neo + Author: Nils Meyer + Author: Peter Boyle + Author: neo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ Author: neo See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -//---------------------------------------------------------------------- + /* ARMv8 NEON intrinsics layer by @@ -37,9 +37,6 @@ Author: neo SFB/TRR55 */ -//---------------------------------------------------------------------- -//#ifndef ARM_NEON -//#define ARM_NEON #ifndef GEN_SIMD_WIDTH #define GEN_SIMD_WIDTH 16u @@ -606,6 +603,4 @@ namespace Optimization { typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; 
-} - -//#endif // ARM_NEON +} \ No newline at end of file diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index e05fecc4..27585547 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,7 +53,7 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#ifdef NEONv8 +#ifdef NEONV8 #include "Grid_neon.h" #endif #if defined QPX From bf630a6821ea8923fc9690a03f621f6d69b31f4e Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:42:25 +0100 Subject: [PATCH 104/170] README file update --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9432abe1..5d168298 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ License: GPL v2. -Last update Nov 2016. +Last update June 2017. _Please do not send pull requests to the `master` branch which is reserved for releases._ @@ -78,14 +78,17 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi for most programmers. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. Please see https://arxiv.org/abs/1512.03487 for more detail. +### Required libraries +Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. + ### Quick start First, start by cloning the repository: @@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `QPX` | QPX (256 bit) | +| `NEONv8` | ARM NEON (128 bit) | +| `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: From 09d09d0fe5bce853e1b42115371cd935a4e29cc0 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:48:11 +0100 Subject: [PATCH 105/170] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5d168298..1f0b450c 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. -Please see https://arxiv.org/abs/1512.03487 for more detail. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. ### Required libraries Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. 
@@ -176,7 +176,7 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `NEONv8` | ARM NEON (128 bit) | +| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) | | `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: @@ -216,4 +216,4 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are w --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc -``` \ No newline at end of file +``` From ac1f1838bc9c143a3e2091e75d3f68e4455d0231 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:15:32 +0100 Subject: [PATCH 106/170] KNL only --- lib/perfmon/PerfCount.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/perfmon/PerfCount.cc b/lib/perfmon/PerfCount.cc index 4778295a..c6f92b9f 100644 --- a/lib/perfmon/PerfCount.cc +++ b/lib/perfmon/PerfCount.cc @@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES }, // 4 -#ifdef AVX512 +#ifdef KNL { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES }, { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS }, From 2d3737a133b6f1208849cd8580badba4ff152a4d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:15:59 +0100 Subject: [PATCH 107/170] O3, KNL --- configure.ac | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f7284d48..8175e8b0 100644 --- a/configure.ac +++ b/configure.ac @@ -27,7 +27,7 @@ AX_GXX_VERSION AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], [version of g++ that will compile the code]) -CXXFLAGS="-g $CXXFLAGS" +CXXFLAGS="-O3 $CXXFLAGS" ############### Checks for typedefs, structures, and compiler characteristics @@ -241,6 +241,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) + AC_DEFINE([KNL],[1],[Knights landing processor]) SIMD_FLAGS='-march=knl';; GEN) AC_DEFINE([GEN],[1],[generic vector code]) @@ -276,6 +277,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing]) + AC_DEFINE([KNL],[1],[Knights landing processor]) SIMD_FLAGS='-xmic-avx512';; GEN) AC_DEFINE([GEN],[1],[generic vector code]) From 694b305cab39e1b7870ca57107521679486c611a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:16:13 +0100 Subject: [PATCH 108/170] Update to reporting --- benchmarks/Benchmark_dwf.cc | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a071c050..d50cc3a0 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -165,7 +165,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); @@ -302,6 +302,7 @@ int main (int argc, char ** argv) std::cout<< "sD ERR \n " << err < Date: Fri, 30 Jun 2017 10:16:35 +0100 Subject: [PATCH 109/170] Switch off counters by default --- benchmarks/Benchmark_dwf.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index d50cc3a0..7814ec7d 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -383,7 +383,7 @@ int main (int argc, char ** argv) assert(error<1.0e-4); } - if(1){ + if(0){ std::cout << "Single cache warm call to sDw.Dhop " < Date: Fri, 30 Jun 2017 10:23:51 +0100 Subject: [PATCH 110/170] Interleave code path; not enabled --- lib/stencil/Lebesgue.cc | 25 ++++++++++++++++++++++++- lib/stencil/Lebesgue.h | 2 ++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 4551878c..0c644fc1 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -51,8 +51,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) if ( Block[0]==0) ZGraph(); else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); -} + if (0) { + std::cout << "Thread Interleaving"< reorder = _LebesgueReorder; + std::vector throrder; + int vol = _LebesgueReorder.size(); + int threads = GridThread::GetThreads(); + int blockbits=3; + int blocklen = 8; + int msk = 0x7; + + for(int t=0;t> blockbits) % threads == t ) { + throrder.push_back(reorder[ss]); + } + } + } + _LebesgueReorder = throrder; +} void LebesgueOrder::NoBlocking(void) { std::cout< & xi, std::vector &dims); + void ThreadInterleave(void); + private: std::vector _LebesgueReorder; From f20eceb6cd6469c496e07e01055a08c0e0e4f7c8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:48:27 +0100 Subject: [PATCH 111/170] First touch once per page in a threaded loop --- lib/allocator/AlignedAllocator.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 6e85ab27..54090024 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -98,7 +98,12 @@ public: #else if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); #endif - + // First touch optimise in threaded loop + uint8_t *cp = (uint8_t *)ptr; +#pragma omp parallel for + for(size_type n=0;n Date: Fri, 30 Jun 2017 10:49:08 +0100 Subject: [PATCH 112/170] Guard first touch --- lib/allocator/AlignedAllocator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 54090024..4513ce26 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -100,7 +100,9 @@ public: #endif // First touch optimise in threaded loop uint8_t *cp = (uint8_t *)ptr; +#ifdef GRID_OMP #pragma omp parallel for +#endif for(size_type n=0;n Date: Fri, 30 Jun 2017 10:53:22 +0100 Subject: [PATCH 113/170] Best option for Xeon cache blocking set --- lib/stencil/Lebesgue.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 0c644fc1..2880e4b6 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -32,8 +32,11 @@ Author: paboyle namespace Grid { int LebesgueOrder::UseLebesgueOrder; +#ifdef KNL std::vector LebesgueOrder::Block({8,2,2,2}); - +#else +std::vector LebesgueOrder::Block({2,2,2,2}); +#endif LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){ n--; // 1000 0011 --> 1000 0010 n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011 From f3b0a92e71af2577afb68c3021b1f9a8467f3e8e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:48:00 +0100 Subject: [PATCH 114/170] Update README.md --- README.md | 101 
++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1f0b450c..072f7404 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,6 @@ The following options can be use with the `--enable-comms=` option to target dif | `none` | no communications | | `mpi[-auto]` | MPI communications | | `mpi3[-auto]` | MPI communications using MPI 3 shared memory | -| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. @@ -199,21 +198,109 @@ The following configuration is recommended for the Intel Knights Landing platfor ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ - --enable-comms=mpi-auto \ - --with-gmp= \ - --with-mpfr= \ + --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc ``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. -where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ --enable-comms=mpi \ - --with-gmp= \ - --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc ``` + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +Knight's Landing with Intel Omnipath adapters with two adapters per node +presently performs better with use of more than one rank per node, using shared memory +for interior communication. This is the mpi3 communications implementation. +We recommend four ranks per node for best performance, but optimum is local volume dependent. + +``` bash +../configure --enable-precision=double\ + --enable-simd=KNL \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` + +### Build setup for Intel Haswell Xeon platform + +The following configuration is recommended for the Intel Knights Landing platform: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=CC CC=cc +``` +Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of +one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using +``` + export I_MPI_PIN=1 +``` +This is the default. 
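+
+As a quick cross-check of the pinning (a sketch only, not part of the recommended
+recipe; it assumes a two-socket node, an installed `numactl`, and a launcher that
+exports `PMI_RANK`), each rank can instead be bound explicitly to its own socket:
+
+``` bash
+#!/bin/bash
+# bind rank N of a dual-socket node to the CPUs and memory of NUMA node N
+socket=$(( PMI_RANK % 2 ))
+exec numactl --cpunodebind=$socket --membind=$socket "$@"
+```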
+ +### Build setup for Intel Skylake Xeon platform + +The following configuration is recommended for the Intel Knights Landing platform: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX512 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX512 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=CC CC=cc +``` +Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of +one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using +``` + export I_MPI_PIN=1 +``` +This is the default. + + From e18929eaa0c8e6de539abf2c2ef259ea0816ea7e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:53:15 +0100 Subject: [PATCH 115/170] Update README.md --- README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 072f7404..f3645b3a 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,8 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. @@ -228,26 +229,27 @@ We recommend four ranks per node for best performance, but optimum is local volu ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ - --enable-comms=mpi3 \ + --enable-comms=mpi3-auto \ --enable-mkl \ - CXX=mpiicpc + CC=icpc MPICXX=mpiicpc ``` ### Build setup for Intel Haswell Xeon platform -The following configuration is recommended for the Intel Knights Landing platform: +The following configuration is recommended for the Intel Haswell platform: ``` bash ../configure --enable-precision=double\ --enable-simd=AVX2 \ - --enable-comms=mpi3 \ + --enable-comms=mpi3-auto \ --enable-mkl \ - CXX=mpiicpc + CXX=icpc MPICXX=mpiicpc ``` The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. @@ -270,7 +272,7 @@ This is the default. ### Build setup for Intel Skylake Xeon platform -The following configuration is recommended for the Intel Knights Landing platform: +The following configuration is recommended for the Intel Skylake platform: ``` bash ../configure --enable-precision=double\ @@ -282,7 +284,8 @@ The following configuration is recommended for the Intel Knights Landing platfor The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. @@ -298,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of one rank per socket. 
If using the Intel MPI library, threads should be pinned to NUMA domains using -``` +``` bash export I_MPI_PIN=1 ``` This is the default. From 251a97fe1be59f28686e1d07f8576c7d9f815517 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:55:36 +0100 Subject: [PATCH 116/170] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f3645b3a..f9fd7ab5 100644 --- a/README.md +++ b/README.md @@ -301,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using -``` bash +``` export I_MPI_PIN=1 ``` This is the default. From 1354b46338bfaaa338e4e3ad7430e8b8fe087057 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:04:32 +0100 Subject: [PATCH 117/170] Update README.md --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index f9fd7ab5..8f0babd9 100644 --- a/README.md +++ b/README.md @@ -306,4 +306,20 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for laptops, other compilers, non-cluster builds + +Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX), +and omit the enable-mkl flag. + +Single node builds are enabled with +``` + --enable-comms=none +``` + +FFTW support that is not in the default search path may then enabled with +``` + --with-fftw= +``` + +BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation. From 3d09e3e9e0c3b24e1646db3083aba01537bcf88a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:05:46 +0100 Subject: [PATCH 118/170] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 8f0babd9..8f7a3d42 100644 --- a/README.md +++ b/README.md @@ -306,6 +306,14 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for BlueGene/Q + +To be written... + +### Build setup for ARM Neon + +To be written.. + ### Build setup for laptops, other compilers, non-cluster builds Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX), From 37263fd9b181f1190ff201203da6ac6a431e045d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:06:24 +0100 Subject: [PATCH 119/170] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f7a3d42..afb751f5 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,7 @@ To be written... ### Build setup for ARM Neon -To be written.. +To be written... ### Build setup for laptops, other compilers, non-cluster builds From b68ad0cc0bf6ab479199020fd6b976229c0cb047 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:20:07 +0100 Subject: [PATCH 120/170] Update README.md --- README.md | 74 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index afb751f5..a786bc6c 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,37 @@ Last update June 2017. 
_Please do not send pull requests to the `master` branch which is reserved for releases._ + + +### Description +This library provides data parallel C++ container classes with internal memory layout +that is transformed to map efficiently to SIMD architectures. CSHIFT facilities +are provided, similar to HPF and cmfortran, and user control is given over the mapping of +array indices to both MPI tasks and SIMD processing elements. + +* Identically shaped arrays then be processed with perfect data parallelisation. +* Such identically shaped arrays are called conformable arrays. + +The transformation is based on the observation that Cartesian array processing involves +identical processing to be performed on different regions of the Cartesian array. + +The library will both geometrically decompose into MPI tasks and across SIMD lanes. +Local vector loops are parallelised with OpenMP pragmas. + +Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but +optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification +for most programmers. + +The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. + +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. +The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. + +MPI, OpenMP, and SIMD parallelism are present in the library. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. + + ### Compilers Intel ICPC v16.0.3 and later @@ -56,38 +87,19 @@ When you file an issue, please go though the following checklist: 6. Attach the output of `make V=1`. 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example. - - -### Description -This library provides data parallel C++ container classes with internal memory layout -that is transformed to map efficiently to SIMD architectures. CSHIFT facilities -are provided, similar to HPF and cmfortran, and user control is given over the mapping of -array indices to both MPI tasks and SIMD processing elements. - -* Identically shaped arrays then be processed with perfect data parallelisation. -* Such identically shaped arrays are called conformable arrays. - -The transformation is based on the observation that Cartesian array processing involves -identical processing to be performed on different regions of the Cartesian array. - -The library will both geometrically decompose into MPI tasks and across SIMD lanes. -Local vector loops are parallelised with OpenMP pragmas. - -Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but -optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification -for most programmers. - -The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. - -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. -The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. - -MPI, OpenMP, and SIMD parallelism are present in the library. 
-Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. - ### Required libraries -Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. +Grid requires: +[GMP](https://gmplib.org/), +[MPFR](http://www.mpfr.org/) + +Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library. + +Grid optionally uses: +[HDF5](https://support.hdfgroup.org/HDF5/) +[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) +[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library) +[LAPACK]( either generic or Intel MKL library) + ### Quick start First, start by cloning the repository: From 7b0237b0819d6981767a0189f7550546a58a8683 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:24:41 +0100 Subject: [PATCH 121/170] Update README.md --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a786bc6c..3572be26 100644 --- a/README.md +++ b/README.md @@ -89,16 +89,22 @@ When you file an issue, please go though the following checklist: ### Required libraries Grid requires: + [GMP](https://gmplib.org/), + [MPFR](http://www.mpfr.org/) Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library. Grid optionally uses: + [HDF5](https://support.hdfgroup.org/HDF5/) -[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) -[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library) -[LAPACK]( either generic or Intel MKL library) + +[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. + +[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library. + +LAPACK either generic version or Intel MKL library. 
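+
+If these are installed outside the default search paths they can be passed to
+`configure`; for example (a sketch assuming the `--with-<package>=<prefix>`
+convention used elsewhere in this README):
+
+``` bash
+../configure --with-gmp=<prefix> --with-mpfr=<prefix> --with-lime=<prefix>
+```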
### Quick start From 40e119c61cac619b7fa1874e5fa7ccdc1dcb77cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 8 Jul 2017 22:27:11 -0400 Subject: [PATCH 122/170] NUMA improvements worth preserving from AMD EPYC tests --- benchmarks/Benchmark_memory_bandwidth.cc | 48 ++++++++++++------------ lib/allocator/AlignedAllocator.h | 3 +- lib/communicator/Communicator_mpi3.cc | 20 +++++++++- 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index 1136dfe0..848f271d 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -60,16 +60,16 @@ int main (int argc, char ** argv) for(int lat=8;lat<=lmax;lat+=8){ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); uint64_t Nloop=NLOOP; - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); double a=2.0; @@ -83,7 +83,7 @@ int main (int argc, char ** argv) double time = (stop-start)/Nloop*1000; double flops=vol*Nvec*2;// mul,add - double bytes=3*vol*Nvec*sizeof(Real); + double bytes=3.0*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); double a=2.0; uint64_t Nloop=NLOOP; @@ -119,7 +119,7 @@ int main (int argc, char ** argv) double time = (stop-start)/Nloop*1000; double flops=vol*Nvec*2;// mul,add - double bytes=3*vol*Nvec*sizeof(Real); + double bytes=3.0*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); RealD a=2.0; @@ -154,7 +154,7 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=2*vol*Nvec*sizeof(Real); + double bytes=2.0*vol*Nvec*sizeof(Real); 
double flops=vol*Nvec*1;// mul std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); RealD a=2.0; Real nn; double start=usecond(); @@ -187,7 +187,7 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=vol*Nvec*sizeof(Real); + double bytes=1.0*vol*Nvec*sizeof(Real); double flops=vol*Nvec*2;// mul,add std::cout< #include #include #include -//#include +#include +#include #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif @@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); + + int status; + int flags=MPOL_MF_MOVE; +#ifdef KNL + int nodes=1; // numa domain == MCDRAM + // Find out if in SNC2,SNC4 mode ? +#else + int nodes=r; // numa domain == MPI ID +#endif + unsigned long count=1; + for(uint64_t page=0;page Date: Sun, 9 Jul 2017 00:11:54 +0100 Subject: [PATCH 123/170] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 3572be26..e0a9bb14 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,60 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for AMD EPYC / RYZEN + +The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores. +So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total +are common. Each chip within the module exposes a separate NUMA domain. +There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain. +MPI-3 is recommended with the use of four ranks per socket, +and 8 threads per rank. + +The following configuration is recommended for the AMD EPYC platform. + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + CXX=mpicxx +``` + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` bash + --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank. +This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
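+
+Before relying on the wrapper, it is worth confirming the topology it assumes
+(eight NUMA domains of sixteen hardware threads on a dual-socket EPYC 7601);
+if `numactl` is installed this can be checked directly:
+
+```
+numactl --hardware
+```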
+ +It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and +shared memory to communicate within this node: + +mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 + +Where omp_bind.sh does the following: +``` +#!/bin/bash + +numanode=` expr $PMI_RANK % 8 ` +basecore=`expr $numanode \* 16` +core0=`expr $basecore + 0 ` +core1=`expr $basecore + 2 ` +core2=`expr $basecore + 4 ` +core3=`expr $basecore + 6 ` +core4=`expr $basecore + 8 ` +core5=`expr $basecore + 10 ` +core6=`expr $basecore + 12 ` +core7=`expr $basecore + 14 ` + +export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7" +echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY + +$@ +``` + ### Build setup for BlueGene/Q To be written... From dc6f078246b006ad1b3e61c513273b73f8f0da81 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 11 Jul 2017 14:15:08 +0100 Subject: [PATCH 124/170] fixed the header file for mpi3 --- configure.ac | 8 +++++++- lib/communicator/Communicator_mpi3.cc | 18 +++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/configure.ac b/configure.ac index 8c43d67a..dc6754da 100644 --- a/configure.ac +++ b/configure.ac @@ -51,6 +51,7 @@ AC_CHECK_HEADERS(malloc/malloc.h) AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(execinfo.h) +AC_CHECK_HEADERS(numaif.h) AC_CHECK_DECLS([ntohll],[], [], [[#include ]]) AC_CHECK_DECLS([be64toh],[], [], [[#include ]]) @@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)]) AC_SEARCH_LIBS([crc32], [z], [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])] - [have_zlib=true], + [have_zlib=true] [LIBS="${LIBS} -lz"], [AC_MSG_ERROR(zlib library was not found in your system.)]) +AC_SEARCH_LIBS([move_pages], [numa], + [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])] + [have_libnuma=true] [LIBS="${LIBS} -lnuma"], + [AC_MSG_WARN(libnuma library was not found in your system. 
Some optimisations will not apply)]) + AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp], [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])] [have_hdf5=true] diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index f5646d44..4192300b 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -38,7 +38,9 @@ Author: Peter Boyle #include #include #include +#ifdef HAVE_NUMAIF_H #include +#endif #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif @@ -216,6 +218,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); + // Try to force numa domain on the shm segment if we have numaif.h +#ifdef HAVE_NUMAIF_H int status; int flags=MPOL_MF_MOVE; #ifdef KNL @@ -225,13 +229,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int nodes=r; // numa domain == MPI ID #endif unsigned long count=1; - for(uint64_t page=0;page Date: Wed, 12 Jul 2017 15:01:48 +0100 Subject: [PATCH 125/170] For test/solver Fixed --- lib/lattice/Lattice_reduction.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index c5b20f3c..38982891 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -540,7 +540,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice for(int i=0;i Date: Fri, 14 Jul 2017 22:52:16 +0100 Subject: [PATCH 126/170] Update README.md --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index e0a9bb14..ea20d0ec 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,17 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** + +mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 +Average mflops/s per call per node (full): ** 498739 ** 4d vec +Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms +Average mflops/s per call per node (full): ** 572645 ** 5d vec +Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black +Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black + + + ### Build setup for AMD EPYC / RYZEN The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores. @@ -378,6 +389,17 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY $@ ``` +Performance: + +** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** + +mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 +Average mflops/s per call per node (full): **420235** 4d vec +Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms +Average mflops/s per call per node (full): **522988** 5d vec +Average mflops/s per call per node (full): **588984** 5d vec, red black +Average mflops/s per call per node (full): **508423** 4d vec, red black + ### Build setup for BlueGene/Q To be written... 
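The omp_bind.sh recipe above hard-codes MPICH/Intel MPI's `PMI_RANK`. A sketch of the same binding logic written to be launcher-agnostic (it assumes OpenMPI exports `OMPI_COMM_WORLD_LOCAL_RANK`, which current versions do, and that `seq` is GNU seq):

```
#!/bin/bash
# pick up the node-local rank from whichever launcher variable is present
rank=${PMI_RANK:-${OMPI_COMM_WORLD_LOCAL_RANK:-0}}
numanode=$(( rank % 8 ))
basecore=$(( numanode * 16 ))
# one OpenMP thread per physical core: even-numbered hardware threads of this NUMA domain
export GOMP_CPU_AFFINITY="$(seq -s ' ' $basecore 2 $(( basecore + 14 )))"
exec "$@"
```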
From 169f4b2711f0131f1909738c2b631ced3e47c9e1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 14 Jul 2017 22:56:06 +0100 Subject: [PATCH 127/170] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ea20d0ec..124c7bfa 100644 --- a/README.md +++ b/README.md @@ -327,11 +327,11 @@ This is the default. ** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 -Average mflops/s per call per node (full): ** 498739 ** 4d vec -Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms -Average mflops/s per call per node (full): ** 572645 ** 5d vec -Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black -Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black +- Average mflops/s per call per node (full): 498739 : 4d vec +- Average mflops/s per call per node (full): 457786 : 4d vec, fp16 comms +- Average mflops/s per call per node (full): 572645 : 5d vec +- Average mflops/s per call per node (full): 721206 : 5d vec, red black +- Average mflops/s per call per node (full): 634542 : 4d vec, red black @@ -391,14 +391,14 @@ $@ Performance: -** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 -Average mflops/s per call per node (full): **420235** 4d vec -Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms -Average mflops/s per call per node (full): **522988** 5d vec -Average mflops/s per call per node (full): **588984** 5d vec, red black -Average mflops/s per call per node (full): **508423** 4d vec, red black +- Average mflops/s per call per node (full): 420235 : 4d vec +- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms +- Average mflops/s per call per node (full): 522988 : 5d vec +- Average mflops/s per call per node (full): 588984 : 5d vec, red black +- Average mflops/s per call per node (full): 508423 : 4d vec, red black ### Build setup for BlueGene/Q From f038c6babe1ec5cd3772c4bcb892d19709dc96f5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 14 Jul 2017 22:59:16 +0100 Subject: [PATCH 128/170] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 124c7bfa..a185063e 100644 --- a/README.md +++ b/README.md @@ -324,7 +324,7 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. 
-** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 - Average mflops/s per call per node (full): 498739 : 4d vec @@ -391,7 +391,7 @@ $@ Performance: -### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 - Average mflops/s per call per node (full): 420235 : 4d vec From fe4912880d3ceaf96023e5074682cc4ee43cb871 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 17 Jul 2017 09:53:07 +0100 Subject: [PATCH 129/170] Update README.md --- README.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a185063e..1e0988f3 100644 --- a/README.md +++ b/README.md @@ -327,12 +327,8 @@ This is the default. #### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 -- Average mflops/s per call per node (full): 498739 : 4d vec -- Average mflops/s per call per node (full): 457786 : 4d vec, fp16 comms -- Average mflops/s per call per node (full): 572645 : 5d vec -- Average mflops/s per call per node (full): 721206 : 5d vec, red black -- Average mflops/s per call per node (full): 634542 : 4d vec, red black +TBA ### Build setup for AMD EPYC / RYZEN @@ -394,11 +390,8 @@ Performance: #### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 -- Average mflops/s per call per node (full): 420235 : 4d vec -- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms -- Average mflops/s per call per node (full): 522988 : 5d vec -- Average mflops/s per call per node (full): 588984 : 5d vec, red black -- Average mflops/s per call per node (full): 508423 : 4d vec, red black + +TBA ### Build setup for BlueGene/Q From 0f214ad427c2f903bc5effeb453f5bed27034cc5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Fri, 21 Jul 2017 11:13:51 -0400 Subject: [PATCH 130/170] Moved FourierAcceleratedGaugeFixer into Grid::QCD namespace and removed 'using namespace' directives from header --- lib/qcd/utils/GaugeFix.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h index 4ff216e4..f2ea1aa2 100644 --- a/lib/qcd/utils/GaugeFix.h +++ b/lib/qcd/utils/GaugeFix.h @@ -26,12 +26,12 @@ Author: Peter Boyle /* END LEGAL */ //#include -using namespace Grid; -using namespace Grid::QCD; +namespace Grid { +namespace QCD { template class FourierAcceleratedGaugeFixer : public Gimpl { - public: + public: INHERIT_GIMPL_TYPES(Gimpl); typedef typename Gimpl::GaugeLinkField GaugeMat; @@ -186,3 +186,5 @@ class FourierAcceleratedGaugeFixer : public Gimpl { } }; +} +} From 56967818626452a318c058684b9594adca4f7fa4 Mon 
Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 12:07:34 +0100 Subject: [PATCH 131/170] Debug error in Tensor mult --- lib/tensors/Tensor_arith_mul.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/tensors/Tensor_arith_mul.h b/lib/tensors/Tensor_arith_mul.h index c24853b7..a474db9c 100644 --- a/lib/tensors/Tensor_arith_mul.h +++ b/lib/tensors/Tensor_arith_mul.h @@ -98,7 +98,9 @@ template strong_inline void mult(iVector * __restrict__ ret, const iVector * __restrict__ rhs, const iScalar * __restrict__ lhs){ - mult(ret,lhs,rhs); + for(int c1=0;c1_internal[c1],&rhs->_internal[c1],&lhs->_internal); + } } From 237cfd11ab493e1ea8ffaf24fc1da5171b8b929a Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 12:08:51 +0100 Subject: [PATCH 132/170] Solving the spurious O2 flags --- configure.ac | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index dc6754da..a028fb0a 100644 --- a/configure.ac +++ b/configure.ac @@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ################ Get git info #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])]) +################ Set flags +# do not move! +CXXFLAGS="-O3 $CXXFLAGS" + ############### Checks for programs AC_PROG_CXX AC_PROG_RANLIB @@ -27,7 +31,6 @@ AX_GXX_VERSION AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], [version of g++ that will compile the code]) -CXXFLAGS="-O3 $CXXFLAGS" ############### Checks for typedefs, structures, and compiler characteristics From 7abc5613bde6fb4e704145b0f2a4c8fa19090944 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 16:21:17 +0100 Subject: [PATCH 133/170] Added smearing to the topological charge observable --- lib/qcd/modules/ObservableModules.h | 15 ++--- lib/qcd/observables/topological_charge.h | 70 +++++++++++++++++++++--- tests/hmc/Test_hmc_WilsonGauge.cc | 5 +- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/lib/qcd/modules/ObservableModules.h b/lib/qcd/modules/ObservableModules.h index 579fc1ec..24511617 100644 --- a/lib/qcd/modules/ObservableModules.h +++ b/lib/qcd/modules/ObservableModules.h @@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule, NoParameters> typedef ObservableModule, NoParameters> ObsBase; using ObsBase::ObsBase; // for constructors - - // acquire resource virtual void initialize(){ this->ObservablePtr.reset(new PlaquetteLogger()); @@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule, NoParameters> PlaquetteMod(): ObsBase(NoParameters()){} }; + template < class Impl > -class TopologicalChargeMod: public ObservableModule, NoParameters>{ - typedef ObservableModule, NoParameters> ObsBase; +class TopologicalChargeMod: public ObservableModule, TopologyObsParameters>{ + typedef ObservableModule, TopologyObsParameters> ObsBase; using ObsBase::ObsBase; // for constructors - - // acquire resource virtual void initialize(){ - this->ObservablePtr.reset(new TopologicalCharge()); + this->ObservablePtr.reset(new TopologicalCharge(this->Par_)); } public: - TopologicalChargeMod(): ObsBase(NoParameters()){} + TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){} + TopologicalChargeMod(): ObsBase(){} }; - }// QCD temporarily here diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h index 5d09c420..c2c419fb 100644 --- a/lib/qcd/observables/topological_charge.h +++ b/lib/qcd/observables/topological_charge.h @@ -33,9 +33,45 @@ directory namespace Grid { namespace QCD { +struct 
diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h
index 5d09c420..c2c419fb 100644
--- a/lib/qcd/observables/topological_charge.h
+++ b/lib/qcd/observables/topological_charge.h
@@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {
 
+struct TopologySmearingParameters : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
+                                  int, steps,
+                                  float, step_size,
+                                  int, meas_interval,
+                                  float, maxTau);
+
+  TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+    steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+
+  template < class ReaderClass >
+  TopologySmearingParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "Smearing", *this);
+  }
+};
+
+
+
+struct TopologyObsParameters : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
+                                  int, interval,
+                                  bool, do_smearing,
+                                  TopologySmearingParameters, Smearing);
+
+  TopologyObsParameters(int interval = 1, bool smearing = false):
+    interval(interval), Smearing(smearing){}
+
+  template <class ReaderClass >
+  TopologyObsParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "TopologyMeasurement", *this);
+  }
+};
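Since both structs are Serializable, they can also be filled from an input file through a Reader. A hypothetical read, assuming Grid's XmlReader and a file whose layout follows the serialisable members above (file name and values are illustrative, not part of this patch):

    // input.xml (illustrative layout):
    //   <TopologyMeasurement>
    //     <interval>5</interval>
    //     <do_smearing>true</do_smearing>
    //     <Smearing>
    //       <steps>200</steps> <step_size>0.01</step_size>
    //       <meas_interval>50</meas_interval> <maxTau>2.0</maxTau>
    //     </Smearing>
    //   </TopologyMeasurement>
    XmlReader Reader("input.xml");
    TopologyObsParameters TopParams(Reader); // reads the "TopologyMeasurement" node
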
" << Pars.Smearing.step_size << std::endl; + WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); + WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); + Real T0 = WF.energyDensityPlaquette(Usmear); + std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) + << "T0 : [ " << traj << " ] "<< T0 << std::endl; + } - int def_prec = std::cout.precision(); + Real q = WilsonLoops::TopologicalCharge(Usmear); + std::cout << GridLogMessage + << std::setprecision(std::numeric_limits::digits10 + 1) + << "Topological Charge: [ " << traj << " ] "<< q << std::endl; - std::cout << GridLogMessage - << std::setprecision(std::numeric_limits::digits10 + 1) - << "Topological Charge: [ " << traj << " ] "<< q << std::endl; - - std::cout.precision(def_prec); + std::cout.precision(def_prec); + } } }; diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc index b2d5fb02..4cf6d923 100644 --- a/tests/hmc/Test_hmc_WilsonGauge.cc +++ b/tests/hmc/Test_hmc_WilsonGauge.cc @@ -66,7 +66,10 @@ int main(int argc, char **argv) { typedef PlaquetteMod PlaqObs; typedef TopologicalChargeMod QObs; TheHMC.Resources.AddObservable(); - TheHMC.Resources.AddObservable(); + TopologyObsParameters TopParams; + TopParams.interval = 1; + TopParams.do_smearing = false; + TheHMC.Resources.AddObservable(TopParams); ////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From c0485d799d915637fdc455dfa900ee9786f7cd69 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 16:26:04 +0100 Subject: [PATCH 134/170] Explicit parameter declaration in the WilsonGauge test --- lib/qcd/observables/topological_charge.h | 1 - tests/hmc/Test_hmc_WilsonGauge.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h index c2c419fb..5af8d77b 100644 --- a/lib/qcd/observables/topological_charge.h +++ b/lib/qcd/observables/topological_charge.h @@ -99,7 +99,6 @@ class TopologicalCharge : public HmcObservable { if (Pars.do_smearing){ // using wilson flow by default here - std::cout << "1. 
" << Pars.Smearing.step_size << std::endl; WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); Real T0 = WF.energyDensityPlaquette(Usmear); diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc index 4cf6d923..05bf81a2 100644 --- a/tests/hmc/Test_hmc_WilsonGauge.cc +++ b/tests/hmc/Test_hmc_WilsonGauge.cc @@ -67,8 +67,12 @@ int main(int argc, char **argv) { typedef TopologicalChargeMod QObs; TheHMC.Resources.AddObservable(); TopologyObsParameters TopParams; - TopParams.interval = 1; - TopParams.do_smearing = false; + TopParams.interval = 5; + TopParams.do_smearing = true; + TopParams.Smearing.steps = 200; + TopParams.Smearing.step_size = 0.01; + TopParams.Smearing.meas_interval = 50; + TopParams.Smearing.maxTau = 2.0; TheHMC.Resources.AddObservable(TopParams); ////////////////////////////////////////////// From c7036f671754710c41de00cb0fa90a6e35104467 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 27 Jul 2017 11:15:09 +0100 Subject: [PATCH 135/170] Adding checks for libm and libstdc++ --- configure.ac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configure.ac b/configure.ac index a028fb0a..bf078b13 100644 --- a/configure.ac +++ b/configure.ac @@ -58,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h) AC_CHECK_DECLS([ntohll],[], [], [[#include ]]) AC_CHECK_DECLS([be64toh],[], [], [[#include ]]) +############## Standard libraries +AC_CHECK_LIB([m],[cos]) +AC_CHECK_LIB([stdc++],[abort]) + ############### GMP and MPFR AC_ARG_WITH([gmp], [AS_HELP_STRING([--with-gmp=prefix], From 8bd869da37fc3911665213f96e431e3b60cb0332 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 27 Jul 2017 15:12:50 +0100 Subject: [PATCH 136/170] Correcting a bug in the IO routines --- lib/parallelIO/BinaryIO.h | 133 ++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 47 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 117bec01..108e7ef8 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -98,35 +98,39 @@ class BinaryIO { NerscChecksum(grid,scalardata,nersc_csum); } - - template static inline void NerscChecksum(GridBase *grid,std::vector &fbuf,uint32_t &nersc_csum) + + template + static inline void NerscChecksum(GridBase *grid, std::vector &fbuf, uint32_t &nersc_csum) { - const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); + const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t); - - uint64_t lsites =grid->lSites(); - if (fbuf.size()==1) { - lsites=1; + uint64_t lsites = grid->lSites(); + if (fbuf.size() == 1) + { + lsites = 1; } -#pragma omp parallel - { - uint32_t nersc_csum_thr=0; + #pragma omp parallel + { + uint32_t nersc_csum_thr = 0; -#pragma omp for - for(uint64_t local_site=0;local_site static inline void ScidacChecksum(GridBase *grid,std::vector &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) { const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); @@ -266,7 +270,7 @@ class BinaryIO { grid->Barrier(); GridStopWatch timer; GridStopWatch bstimer; - + nersc_csum=0; scidac_csuma=0; scidac_csumb=0; @@ -362,18 +366,22 @@ class BinaryIO { #else assert(0); #endif - } else { - std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : " - << iodata.size()*sizeof(fobj)<<" bytes"<Barrier(); - grid->GlobalSum(nersc_csum); - grid->GlobalXOR(scidac_csuma); - grid->GlobalXOR(scidac_csumb); - grid->Barrier(); + // if the data size is 1 we do not want to sum over the MPI ranks + if (iodata.size() != 1){ + grid->Barrier(); 
+ grid->GlobalSum(nersc_csum); + grid->GlobalXOR(scidac_csuma); + grid->GlobalXOR(scidac_csumb); + grid->Barrier(); + } } ///////////////////////////////////////////////////////////////////////////// @@ -546,9 +585,9 @@ class BinaryIO { int gsites = grid->gSites(); int lsites = grid->lSites(); - uint32_t nersc_csum_tmp; - uint32_t scidac_csuma_tmp; - uint32_t scidac_csumb_tmp; + uint32_t nersc_csum_tmp = 0; + uint32_t scidac_csuma_tmp = 0; + uint32_t scidac_csumb_tmp = 0; GridStopWatch timer; From 14d53e1c9eb8eb1ef684148728c075813814612e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 29 Jul 2017 13:06:53 -0400 Subject: [PATCH 137/170] Threaded MPI calls patches --- benchmarks/Benchmark_dwf.cc | 2 +- lib/allocator/AlignedAllocator.h | 10 ++- lib/communicator/Communicator_base.cc | 4 +- lib/communicator/Communicator_base.h | 14 ++++- lib/communicator/Communicator_mpit.cc | 25 +++++++- lib/qcd/action/fermion/WilsonFermion5D.cc | 74 ++++++++++++++--------- lib/stencil/Stencil.h | 59 +++++++++--------- lib/util/Init.cc | 6 +- 8 files changed, 128 insertions(+), 66 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a071c050..0264905c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -489,7 +489,7 @@ int main (int argc, char ** argv) //assert(norm2(src_e)<1.0e-4); //assert(norm2(src_o)<1.0e-4); - + exit(0); Grid_finalize(); } diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 6e85ab27..7fd9496f 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -92,11 +92,15 @@ public: size_type bytes = __n*sizeof(_Tp); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); - + ////////////////// + // Hack 2MB align; could make option probably doesn't need configurability + ////////////////// +//define GRID_ALLOC_ALIGN (128) +#define GRID_ALLOC_ALIGN (2*1024*1024) #ifdef HAVE_MM_MALLOC_H - if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); #else - if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); #endif return ptr; diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index a5edf8e9..67bfaed0 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -34,7 +34,9 @@ namespace Grid { /////////////////////////////////////////////////////////////// void * CartesianCommunicator::ShmCommBuf; uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; -CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +CartesianCommunicator::CommunicatorPolicy_t +CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +int CartesianCommunicator::nCommThreads = -1; ///////////////////////////////// // Alloc, free shmem region diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 4e471b43..84dbedb4 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -54,8 +54,9 @@ class CartesianCommunicator { // 128MB shared memory for comms enought for 48^4 local vol comms // Give external control (command line override?) 
of this - static const int MAXLOG2RANKSPERNODE = 16; - static uint64_t MAX_MPI_SHM_BYTES; + static const int MAXLOG2RANKSPERNODE = 16; + static uint64_t MAX_MPI_SHM_BYTES; + static int nCommThreads; // Communicator should know nothing of the physics grid, only processor grid. int _Nprocessors; // How many in all @@ -125,7 +126,7 @@ class CartesianCommunicator { enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; static CommunicatorPolicy_t CommunicatorPolicy; static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } - + size_t heap_top; size_t heap_bytes; @@ -215,6 +216,12 @@ class CartesianCommunicator { void SendToRecvFromComplete(std::vector &waitall); + double StencilSendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); + double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank, @@ -222,6 +229,7 @@ class CartesianCommunicator { int recv_from_rank, int bytes,int dir); + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 24a518ec..f522701c 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 6a6bc1f8..0b6c9e3d 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -391,37 +391,57 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime+=usecond(); // Rely on async comms; start comms before merge of local data - DhopComputeTime-=usecond(); - DhopCommTime-=usecond(); -#pragma omp parallel + double ctime=0; + double ptime=0; + // DhopComputeTime-=usecond(); + // DhopCommTime-=usecond(); +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) { - // Should time this somehow; hard as the threads fork nowait - st.CommunicateThreaded(); - - if (dag == DaggerYes) { -#pragma omp for - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = st.Packets.size(); + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } + ptime = usecond() - start; } - } else { -#pragma omp for - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + { + double start = 
usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; } } -#pragma omp single - DhopComputeTime+=usecond(); - -#pragma omp taskwait - -#pragma omp single - DhopCommTime+=usecond(); - } // Closes parallel region and waits the comms (I hope) - + DhopCommTime += ctime; + DhopComputeTime+=ptime; DhopFaceTime-=usecond(); st.CommsMerge(compressor); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 17db64d8..d1d7a7e0 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; + std::vector comms_bytesthr; + std::vector commtimethr; //////////////////////////////////////// // Stencil query @@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// void CommunicateThreaded() { - for(int i=0;i reqs; - bytes=_grid->StencilSendToRecvFromBegin(reqs, - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - _grid->StencilSendToRecvFromComplete(reqs,i); - // Last task logged; this is approximate but hard to catch - // the last to complete - stop = usecond(); - stop = stop - start; - - if ( i==0 ) commtime+=stop; - -#pragma omp critical - { - comms_bytes+=bytes; - } - + // must be called in parallel region + int mythread = omp_get_thread_num(); + int nthreads = CartesianCommunicator::nCommThreads; + if (nthreads == -1) nthreads = Packets.size(); + if (mythread < nthreads) { + for (int i = mythread; i < Packets.size(); i += nthreads) { + double start = usecond(); + uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comms_bytesthr[mythread] += bytes; + commtimethr[mythread] += usecond() - start; } } - } void CommunicateBegin(std::vector > &reqs) { @@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int checkerboard, const std::vector &directions, const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) + : _permute_type(npoints), + _comm_buf_size(npoints), + comms_bytesthr(npoints), + commtimethr(npoints) { face_table_computed=0; _npoints = npoints; @@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal void ZeroCounters(void) { gathertime = 0.; commtime = 0.; + memset(&commtimethr[0], 0, sizeof(commtimethr)); + memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr)); halogtime = 0.; mergetime = 0.; decompresstime = 0.; @@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); + double t = 0; + // if commtimethr is set they were all done in parallel so take the max + // but add up the bytes + for (int i = 0; i < 8; ++i) { + comms_bytes += comms_bytesthr[i]; + if (t < commtimethr[i]) t = commtimethr[i]; + } + commtime += t; _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. 
) { diff --git a/lib/util/Init.cc b/lib/util/Init.cc index fc701ac1..ef875429 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ LebesgueOrder::UseLebesgueOrder=1; } - + CartesianCommunicator::nCommThreads = -1; + if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){ + arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads"); + GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); + } if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); From 175f393f9d1b3dda4da435a6d995003eddb7b257 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 4 Aug 2017 12:14:10 +0100 Subject: [PATCH 138/170] Binary IO error checking --- lib/parallelIO/BinaryIO.h | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 108e7ef8..f56f6514 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -413,13 +413,33 @@ class BinaryIO { timer.Start(); if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO - std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; - ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); - ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); - ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); - MPI_File_close(&fh); - MPI_Type_free(&fileArray); - MPI_Type_free(&localArray); + std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; + ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); + std::cout << GridLogMessage << "Checking for errors" << std::endl; + if (ierr != MPI_SUCCESS) + { + char error_string[BUFSIZ]; + int length_of_error_string, error_class; + + MPI_Error_class(ierr, &error_class); + MPI_Error_string(error_class, error_string, &length_of_error_string); + fprintf(stderr, "%3d: %s\n", myrank, error_string); + MPI_Error_string(ierr, error_string, &length_of_error_string); + fprintf(stderr, "%3d: %s\n", myrank, error_string); + MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); + } + + std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl; + ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); + assert(ierr == 0); + + std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl; + ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); + assert(ierr == 0); + + MPI_File_close(&fh); + MPI_Type_free(&fileArray); + MPI_Type_free(&localArray); #else assert(0); #endif From 4fe182e5a7c4b1d1dddc022706a71f1c0432cda5 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sun, 6 Aug 2017 10:46:19 +0100 Subject: [PATCH 139/170] Added high level HMC support for overriding default SIMD lane decomposition --- lib/cartesian/Cartesian_base.h | 23 ++++---- lib/qcd/hmc/HMCResourceManager.h | 14 ++++- lib/qcd/hmc/HMC_GridModules.h | 92 +++++++++++++++++++++-------- lib/util/Init.cc | 2 +- tests/hmc/Test_hmc_EOMobiusRatio.cc | 13 ++-- 5 files changed, 98 insertions(+), 46 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 0db6ce0d..f4f9a269 100644 --- 
a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -185,17 +185,18 @@ public: //////////////////////////////////////////////////////////////// void show_decomposition(){ - std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl; - std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl; - std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl; - std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl; - std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl; - std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl; - std::cout << GridLogMessage << "iSites : " << _isites << std::endl; - std::cout << GridLogMessage << "oSites : " << _osites << std::endl; - std::cout << GridLogMessage << "lSites : " << lSites() << std::endl; - std::cout << GridLogMessage << "gSites : " << gSites() << std::endl; - std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl; + std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl; + std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl; + std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl; + std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl; + std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; + std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl; + std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl; + std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl; + std::cout << GridLogMessage << "\toSites : " << _osites << std::endl; + std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl; + std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl; + std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl; } //////////////////////////////////////////////////////////////// diff --git a/lib/qcd/hmc/HMCResourceManager.h b/lib/qcd/hmc/HMCResourceManager.h index cf0000ed..3e20a8c1 100644 --- a/lib/qcd/hmc/HMCResourceManager.h +++ b/lib/qcd/hmc/HMCResourceManager.h @@ -165,7 +165,7 @@ class HMCResourceManager { // Grids ////////////////////////////////////////////////////////////// - void AddGrid(std::string s, GridModule& M) { + void AddGrid(const std::string s, GridModule& M) { // Check for name clashes auto search = Grids.find(s); if (search != Grids.end()) { @@ -174,14 +174,24 @@ class HMCResourceManager { exit(1); } Grids[s] = std::move(M); + std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" < Mod; AddGrid(s, Mod); } + // Add a named grid set, 4d shortcut + tweak simd lanes + void AddFourDimGrid(const std::string s, const std::vector simd_decomposition) { + GridFourDimModule Mod(simd_decomposition); + AddGrid(s, Mod); + } GridCartesian* GetCartesian(std::string s = "") { diff --git a/lib/qcd/hmc/HMC_GridModules.h b/lib/qcd/hmc/HMC_GridModules.h index 8331c02b..0f34e9a7 100644 --- a/lib/qcd/hmc/HMC_GridModules.h +++ b/lib/qcd/hmc/HMC_GridModules.h @@ -33,28 +33,29 @@ directory namespace Grid { // Resources -// Modules for grids +// Modules for grids // Introduce another namespace HMCModules? 
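The AddFourDimGrid overload added to HMCResourceManager above takes an explicit SIMD decomposition; it can be exercised as in the sketch below, which mirrors the usage the later EOMobiusRatio test introduces (and then comments out as architecture specific). The lane counts are illustrative; their product must equal vComplex::Nsimd() for the build in hand.

    // Sketch: overriding the default SIMD lane decomposition of the "gauge" grid.
    std::vector<int> simd_lanes({2, 2, 1, 1});
    TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes);
    // dropping the second argument falls back to GridDefaultSimd
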
-class GridModuleParameters: Serializable{ +class GridModuleParameters: Serializable{ public: GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters, std::string, lattice, std::string, mpi); - std::vector getLattice(){return strToVec(lattice);} - std::vector getMpi() {return strToVec(mpi);} + std::vector getLattice() const {return strToVec(lattice);} + std::vector getMpi() const {return strToVec(mpi);} - void check(){ - if (getLattice().size() != getMpi().size()) { - std::cout << GridLogError + + void check() const { + if (getLattice().size() != getMpi().size() ) { + std::cout << GridLogError << "Error in GridModuleParameters: lattice and mpi dimensions " "do not match" << std::endl; exit(1); } - } + } template GridModuleParameters(Reader& Reader, std::string n = "LatticeGrid"):name(n) { @@ -75,51 +76,94 @@ private: // Lower level class class GridModule { public: - GridCartesian* get_full() { + GridCartesian* get_full() { std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl; return grid_.get(); } - GridRedBlackCartesian* get_rb() { + GridRedBlackCartesian* get_rb() { std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl; return rbgrid_.get(); } void set_full(GridCartesian* grid) { grid_.reset(grid); } void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); } + void show_full_decomposition(){ grid_->show_decomposition(); } + void show_rb_decomposition(){ rbgrid_->show_decomposition(); } protected: std::unique_ptr grid_; std::unique_ptr rbgrid_; - + }; //////////////////////////////////// // Classes for the user //////////////////////////////////// // Note: the space time grid should be out of the QCD namespace -template< class vector_type> -class GridFourDimModule : public GridModule { - public: - GridFourDimModule() { +template +class GridFourDimModule : public GridModule +{ +public: + GridFourDimModule() + { using namespace QCD; set_full(SpaceTimeGrid::makeFourDimGrid( - GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()), + GridDefaultLatt(), + GridDefaultSimd(4, vector_type::Nsimd()), GridDefaultMpi())); set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); } - GridFourDimModule(GridModuleParameters Params) { + GridFourDimModule(const std::vector tweak_simd) + { + using namespace QCD; + if (tweak_simd.size() != 4) + { + std::cout << GridLogError + << "Error in GridFourDimModule: SIMD size different from 4" + << std::endl; + exit(1); + } + + // Checks that the product agrees with the expectation + int simd_sum = 1; + for (auto &n : tweak_simd) + simd_sum *= n; + std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Sum: " << simd_sum << std::endl; + + if (simd_sum == vector_type::Nsimd()) + { + set_full(SpaceTimeGrid::makeFourDimGrid( + GridDefaultLatt(), + tweak_simd, + GridDefaultMpi())); + set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); + } + else + { + std::cout << GridLogError + << "Error in GridFourDimModule: SIMD lanes must sum to " + << vector_type::Nsimd() + << std::endl; + } + } + + GridFourDimModule(const GridModuleParameters Params) + { using namespace QCD; - Params.check(); std::vector lattice_v = Params.getLattice(); std::vector mpi_v = Params.getMpi(); - if (lattice_v.size() == 4) { + if (lattice_v.size() == 4) + { set_full(SpaceTimeGrid::makeFourDimGrid( - lattice_v, GridDefaultSimd(4, vector_type::Nsimd()), + lattice_v, + GridDefaultSimd(4, vector_type::Nsimd()), mpi_v)); set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); - } else { - std::cout << GridLogError - << "Error in 
GridFourDimModule: lattice dimension different from 4" - << std::endl; + } + else + { + std::cout << GridLogError + << "Error in GridFourDimModule: lattice dimension different from 4" + << std::endl; exit(1); } } diff --git a/lib/util/Init.cc b/lib/util/Init.cc index fe3b1734..35a569ba 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -377,7 +377,7 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "< - //FermionParameters(Reader& Reader){ - // read(Reader, "Mobius", *this); - //} - }; @@ -113,9 +107,12 @@ int main(int argc, char **argv) { bool ApplySmearing = MyParams.Mobius.ApplySmearing; + // Use this if you want to tweak the default decomposition + std::vector simd_lanes({2,2,1,1}); - // Grid from the command line - TheHMC.Resources.AddFourDimGrid("gauge"); + // Grid from the command line arguments --grid and --mpi + // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes + TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // Possibile to create the module by hand // hardcoding parameters or using a Reader From dbe4d7850c1e132f538e4aead7869ba703a21ec5 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sun, 6 Aug 2017 10:49:45 +0100 Subject: [PATCH 140/170] Make a test file compatible with all architectures --- tests/hmc/Test_hmc_EOMobiusRatio.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/hmc/Test_hmc_EOMobiusRatio.cc b/tests/hmc/Test_hmc_EOMobiusRatio.cc index 4b4555e3..d6ca96db 100644 --- a/tests/hmc/Test_hmc_EOMobiusRatio.cc +++ b/tests/hmc/Test_hmc_EOMobiusRatio.cc @@ -108,11 +108,16 @@ int main(int argc, char **argv) { // Use this if you want to tweak the default decomposition - std::vector simd_lanes({2,2,1,1}); + // commented out as very architecture speficic + + //std::vector simd_lanes({2,2,1,1}); // Grid from the command line arguments --grid and --mpi // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes - TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); + + //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + // Possibile to create the module by hand // hardcoding parameters or using a Reader From 06e6f8de00528ede75f248f98d48eca715d79630 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 8 Aug 2017 10:22:12 +0100 Subject: [PATCH 141/170] Check that the reduced dim is an integer --- lib/cartesian/Cartesian_red_black.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index 3037de00..e58999c5 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -176,7 +176,8 @@ public: // Use a reduced simd grid _simd_layout[d] = simd_layout[d]; - _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; + _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; // this is not checking if this is integer + assert(_rdimensions[d]*_simd_layout[d] == _ldimensions[d]); assert(_rdimensions[d]>0); // all elements of a simd vector must have same checkerboard. 
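The asserts added here make the run abort instead of silently truncating when a dimension does not divide. The constraint as a standalone sketch, outside Grid's actual classes (lattice and lane values illustrative):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> ldims = {8, 8, 8, 16}; // local lattice per rank (illustrative)
      std::vector<int> simd  = {2, 2, 1, 1};  // SIMD lanes per dimension (illustrative)
      for (size_t d = 0; d < ldims.size(); d++) {
        int rdim = ldims[d] / simd[d];        // "reduced" dimension after vectorisation
        assert(rdim * simd[d] == ldims[d]);   // fires for e.g. ldim 6 with 4 lanes
      }
      return 0;
    }
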
From 44051aecd1eb0abc7a61ac814654491804455347 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 8 Aug 2017 10:31:12 +0100 Subject: [PATCH 142/170] Checking for integer divisions in cartesian full --- lib/cartesian/Cartesian_full.h | 130 +++++++++++++++++---------------- 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index b0e47fa4..815e3b22 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -62,77 +62,81 @@ public: return shift; } GridCartesian(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid - ) : GridBase(processor_grid) + const std::vector &simd_layout, + const std::vector &processor_grid) : GridBase(processor_grid) { - /////////////////////// - // Grid information - /////////////////////// - _ndimension = dimensions.size(); - - _fdimensions.resize(_ndimension); - _gdimensions.resize(_ndimension); - _ldimensions.resize(_ndimension); - _rdimensions.resize(_ndimension); - _simd_layout.resize(_ndimension); - _lstart.resize(_ndimension); - _lend.resize(_ndimension); - - _ostride.resize(_ndimension); - _istride.resize(_ndimension); - - _fsites = _gsites = _osites = _isites = 1; + /////////////////////// + // Grid information + /////////////////////// + _ndimension = dimensions.size(); - for(int d=0;d<_ndimension;d++){ - _fdimensions[d] = dimensions[d]; // Global dimensions - _gdimensions[d] = _fdimensions[d]; // Global dimensions - _simd_layout[d] = simd_layout[d]; - _fsites = _fsites * _fdimensions[d]; - _gsites = _gsites * _gdimensions[d]; + _fdimensions.resize(_ndimension); + _gdimensions.resize(_ndimension); + _ldimensions.resize(_ndimension); + _rdimensions.resize(_ndimension); + _simd_layout.resize(_ndimension); + _lstart.resize(_ndimension); + _lend.resize(_ndimension); - //FIXME check for exact division + _ostride.resize(_ndimension); + _istride.resize(_ndimension); - // Use a reduced simd grid - _ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions - _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition - _lstart[d] = _processor_coor[d]*_ldimensions[d]; - _lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; - _osites *= _rdimensions[d]; - _isites *= _simd_layout[d]; - - // Addressing support - if ( d==0 ) { - _ostride[d] = 1; - _istride[d] = 1; - } else { - _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; - _istride[d] = _istride[d-1]*_simd_layout[d-1]; - } + _fsites = _gsites = _osites = _isites = 1; + + for (int d = 0; d < _ndimension; d++) + { + _fdimensions[d] = dimensions[d]; // Global dimensions + _gdimensions[d] = _fdimensions[d]; // Global dimensions + _simd_layout[d] = simd_layout[d]; + _fsites = _fsites * _fdimensions[d]; + _gsites = _gsites * _gdimensions[d]; + + // Use a reduced simd grid + _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions + assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); + + _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition + assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); + + _lstart[d] = _processor_coor[d] * _ldimensions[d]; + _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; + _osites *= _rdimensions[d]; + _isites *= _simd_layout[d]; + + // Addressing support + if (d == 0) + { + _ostride[d] = 1; + _istride[d] = 1; } - - /////////////////////// - // subplane information - /////////////////////// - _slice_block.resize(_ndimension); - 
_slice_stride.resize(_ndimension); - _slice_nblock.resize(_ndimension); - - int block =1; - int nblock=1; - for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; - - for(int d=0;d<_ndimension;d++){ - nblock/=_rdimensions[d]; - _slice_block[d] =block; - _slice_stride[d]=_ostride[d]*_rdimensions[d]; - _slice_nblock[d]=nblock; - block = block*_rdimensions[d]; + else + { + _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; + _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; } + } + /////////////////////// + // subplane information + /////////////////////// + _slice_block.resize(_ndimension); + _slice_stride.resize(_ndimension); + _slice_nblock.resize(_ndimension); + + int block = 1; + int nblock = 1; + for (int d = 0; d < _ndimension; d++) + nblock *= _rdimensions[d]; + + for (int d = 0; d < _ndimension; d++) + { + nblock /= _rdimensions[d]; + _slice_block[d] = block; + _slice_stride[d] = _ostride[d] * _rdimensions[d]; + _slice_nblock[d] = nblock; + block = block * _rdimensions[d]; + } }; }; - - } #endif From 8a3fe60a27e4573faca940efd33d18a7d468c764 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 8 Aug 2017 11:36:20 +0100 Subject: [PATCH 143/170] Added more asserts at grid creation time --- lib/cartesian/Cartesian_red_black.h | 192 +++++++++++++++------------- 1 file changed, 105 insertions(+), 87 deletions(-) diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index e58999c5..b1a5b9ef 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -131,21 +131,21 @@ public: Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0); } void Init(const std::vector &dimensions, - const std::vector &simd_layout, - const std::vector &processor_grid, - const std::vector &checker_dim_mask, - int checker_dim) + const std::vector &simd_layout, + const std::vector &processor_grid, + const std::vector &checker_dim_mask, + int checker_dim) { - /////////////////////// - // Grid information - /////////////////////// + /////////////////////// + // Grid information + /////////////////////// _checker_dim = checker_dim; - assert(checker_dim_mask[checker_dim]==1); + assert(checker_dim_mask[checker_dim] == 1); _ndimension = dimensions.size(); - assert(checker_dim_mask.size()==_ndimension); - assert(processor_grid.size()==_ndimension); - assert(simd_layout.size()==_ndimension); - + assert(checker_dim_mask.size() == _ndimension); + assert(processor_grid.size() == _ndimension); + assert(simd_layout.size() == _ndimension); + _fdimensions.resize(_ndimension); _gdimensions.resize(_ndimension); _ldimensions.resize(_ndimension); @@ -153,115 +153,133 @@ public: _simd_layout.resize(_ndimension); _lstart.resize(_ndimension); _lend.resize(_ndimension); - + _ostride.resize(_ndimension); _istride.resize(_ndimension); - + _fsites = _gsites = _osites = _isites = 1; - - _checker_dim_mask=checker_dim_mask; - for(int d=0;d<_ndimension;d++){ - _fdimensions[d] = dimensions[d]; - _gdimensions[d] = _fdimensions[d]; - _fsites = _fsites * _fdimensions[d]; - _gsites = _gsites * _gdimensions[d]; - - if (d==_checker_dim) { - _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard - } - _ldimensions[d] = _gdimensions[d]/_processors[d]; - _lstart[d] = _processor_coor[d]*_ldimensions[d]; - _lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; + _checker_dim_mask = checker_dim_mask; - // Use a reduced simd grid - _simd_layout[d] = simd_layout[d]; - _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; // this is not checking if this is integer - 
assert(_rdimensions[d]*_simd_layout[d] == _ldimensions[d]); - assert(_rdimensions[d]>0); + for (int d = 0; d < _ndimension; d++) + { + _fdimensions[d] = dimensions[d]; + _gdimensions[d] = _fdimensions[d]; + _fsites = _fsites * _fdimensions[d]; + _gsites = _gsites * _gdimensions[d]; - // all elements of a simd vector must have same checkerboard. - // If Ls vectorised, this must still be the case; e.g. dwf rb5d - if ( _simd_layout[d]>1 ) { - if ( checker_dim_mask[d] ) { - assert( (_rdimensions[d]&0x1) == 0 ); - } - } + if (d == _checker_dim) + { + assert((_gdimensions[d] & 0x1) == 0); + _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard + } + _ldimensions[d] = _gdimensions[d] / _processors[d]; + assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); + _lstart[d] = _processor_coor[d] * _ldimensions[d]; + _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; - _osites *= _rdimensions[d]; - _isites *= _simd_layout[d]; - - // Addressing support - if ( d==0 ) { - _ostride[d] = 1; - _istride[d] = 1; - } else { - _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; - _istride[d] = _istride[d-1]*_simd_layout[d-1]; - } + // Use a reduced simd grid + _simd_layout[d] = simd_layout[d]; + _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer + assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); + assert(_rdimensions[d] > 0); + // all elements of a simd vector must have same checkerboard. + // If Ls vectorised, this must still be the case; e.g. dwf rb5d + if (_simd_layout[d] > 1) + { + if (checker_dim_mask[d]) + { + assert((_rdimensions[d] & 0x1) == 0); + } + } + _osites *= _rdimensions[d]; + _isites *= _simd_layout[d]; + + // Addressing support + if (d == 0) + { + _ostride[d] = 1; + _istride[d] = 1; + } + else + { + _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; + _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; + } } - + //////////////////////////////////////////////////////////////////////////////////////////// // subplane information //////////////////////////////////////////////////////////////////////////////////////////// _slice_block.resize(_ndimension); _slice_stride.resize(_ndimension); _slice_nblock.resize(_ndimension); - - int block =1; - int nblock=1; - for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; - - for(int d=0;d<_ndimension;d++){ - nblock/=_rdimensions[d]; - _slice_block[d] =block; - _slice_stride[d]=_ostride[d]*_rdimensions[d]; - _slice_nblock[d]=nblock; - block = block*_rdimensions[d]; + + int block = 1; + int nblock = 1; + for (int d = 0; d < _ndimension; d++) + nblock *= _rdimensions[d]; + + for (int d = 0; d < _ndimension; d++) + { + nblock /= _rdimensions[d]; + _slice_block[d] = block; + _slice_stride[d] = _ostride[d] * _rdimensions[d]; + _slice_nblock[d] = nblock; + block = block * _rdimensions[d]; } //////////////////////////////////////////////// // Create a checkerboard lookup table //////////////////////////////////////////////// int rvol = 1; - for(int d=0;d<_ndimension;d++){ - rvol=rvol * _rdimensions[d]; + for (int d = 0; d < _ndimension; d++) + { + rvol = rvol * _rdimensions[d]; } _checker_board.resize(rvol); - for(int osite=0;osite<_osites;osite++){ - _checker_board[osite] = CheckerBoardFromOindex (osite); + for (int osite = 0; osite < _osites; osite++) + { + _checker_board[osite] = CheckerBoardFromOindex(osite); } - }; -protected: + + protected: virtual int oIndex(std::vector &coor) { - int idx=0; - for(int d=0;d<_ndimension;d++) { - if( d==_checker_dim ) { - 
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]); - } else { - idx+=_ostride[d]*(coor[d]%_rdimensions[d]); - } + int idx = 0; + for (int d = 0; d < _ndimension; d++) + { + if (d == _checker_dim) + { + idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); + } + else + { + idx += _ostride[d] * (coor[d] % _rdimensions[d]); + } } return idx; }; - + virtual int iIndex(std::vector &lcoor) { - int idx=0; - for(int d=0;d<_ndimension;d++) { - if( d==_checker_dim ) { - idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d])); - } else { - idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); - } - } - return idx; + int idx = 0; + for (int d = 0; d < _ndimension; d++) + { + if (d == _checker_dim) + { + idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); + } + else + { + idx += _istride[d] * (lcoor[d] / _rdimensions[d]); + } + } + return idx; } }; - } #endif From fd367d8bfd95ec193b9528c59d7846508bf82296 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 16 Aug 2017 09:42:57 +0100 Subject: [PATCH 144/170] Debugging the PointerCache --- lib/allocator/AlignedAllocator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc index 4249a72e..04de20bf 100644 --- a/lib/allocator/AlignedAllocator.cc +++ b/lib/allocator/AlignedAllocator.cc @@ -11,7 +11,7 @@ int PointerCache::victim; void *PointerCache::Insert(void *ptr,size_t bytes) { - if (bytes < 4096 ) return NULL; + if (bytes < 4096 ) return ptr; #ifdef GRID_OMP assert(omp_in_parallel()==0); From bcefdd7c4eff147242ededf040653449c2d573c9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:49:02 -0400 Subject: [PATCH 145/170] Align both allocator calls to 2MB --- lib/allocator/AlignedAllocator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 7fd9496f..39734b53 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -186,9 +186,9 @@ public: pointer allocate(size_type __n, const void* _p= 0) { #ifdef HAVE_MM_MALLOC_H - _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); + _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN); #else - _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); + _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp)); #endif return ptr; } From 9e658de2383620b5aa002f319b85442ab24d8115 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:52:44 -0400 Subject: [PATCH 146/170] Use Vector --- benchmarks/Benchmark_comms.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 698f9d25..491fba1e 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -92,8 +92,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + Vector > xbuf(8,Vector(lat*lat*lat*Ls)); + Vector > rbuf(8,Vector(lat*lat*lat*Ls)); int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); @@ -172,8 +172,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + Vector > xbuf(8,Vector(lat*lat*lat*Ls)); + Vector > rbuf(8,Vector(lat*lat*lat*Ls)); int ncomm; From d6472eda8d00c8d0ffc60760a4dd9462702ac00b Mon Sep 17 00:00:00 
2001 From: Peter Boyle Date: Sat, 19 Aug 2017 12:53:18 -0400 Subject: [PATCH 147/170] Use mmap --- lib/communicator/Communicator_base.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 67bfaed0..6767495f 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -26,6 +26,10 @@ Author: Peter Boyle *************************************************************************************/ /* END LEGAL */ #include +#include +#include +#include +#include namespace Grid { @@ -129,8 +133,15 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ +#if 1 + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); + std::cout << "ShmCommBuf "< Date: Sat, 19 Aug 2017 12:53:59 -0400 Subject: [PATCH 148/170] Enable blocking stencil send --- lib/communicator/Communicator_mpit.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index f522701c..c0fb47fd 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,20 +242,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sat, 19 Aug 2017 13:18:50 -0400 Subject: [PATCH 149/170] Fix mpi 3 interface change --- lib/communicator/Communicator_mpi3.cc | 11 +++++++++++ lib/communicator/Communicator_mpit.cc | 25 +++++++------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 46e4745c..e6e33d33 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -621,6 +621,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis } } +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) +{ + std::vector list; + StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + StencilSendToRecvFromComplete(list,dir); +} + double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int dest, diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index c0fb47fd..9a9b26d2 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -242,17 +242,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) +{ + // Do nothing +}; double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, @@ -266,17 +261,11 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, // std::cout << " sending on communicator "< &waitall,int dir) -{ - // Do nothing -}; From bfef525ed2474c0cfe1047e0351ab58ce525ff10 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 19 Aug 2017 23:10:12 +0100 Subject: [PATCH 150/170] New benchmark prep --- benchmarks/Benchmark_ITT.cc | 518 ++++++++++++++++++++++++++++++++++++ 1 file changed, 518 insertions(+) create mode 100644 benchmarks/Benchmark_ITT.cc diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc new file mode 100644 index 00000000..4f16b1de --- /dev/null +++ b/benchmarks/Benchmark_ITT.cc @@ -0,0 +1,518 @@ + 
/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_memory_bandwidth.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void comms_header(){ + std::cout < simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + double dbytes; + for(int i=0;i requests; + dbytes=0; + ncomm=0; + + parallel_for(int dir=0;dir<8;dir++){ + + double tbytes; + int mu =dir % 4; + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int xmit_to_rank; + int recv_from_rank; + if ( dir == mu ) { + int comm_proc=1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } else { + int comm_proc = mpi_layout[mu]-1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } +#if 1 + tbytes= Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[dir][0], + xmit_to_rank, + (void *)&rbuf[dir][0], + recv_from_rank, + bytes,dir); + Grid.StencilSendToRecvFromComplete(requests,dir); +#endif + requests.resize(0); + +#pragma omp atomic + dbytes+=tbytes; + } + } + 
Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } + + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; + + + std::cout< > LatticeVec; + typedef iVector Vec; + + std::vector simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=4){ + + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + Vec rn ; random(sRNG,rn); + + LatticeVec z(&Grid); z=rn; + LatticeVec x(&Grid); x=rn; + LatticeVec y(&Grid); y=rn; + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i mpi = GridDefaultMpi(); assert(mpi.size()==4); + std::vector local({L,L,L,L}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), + GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + uint64_t SHM=NP/NN; + + std::vector internal; + if ( SHM == 1 ) internal = std::vector({1,1,1,1}); + else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); + else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); + else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); + else assert(0); + + std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); + std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + ///////// Source preparation //////////// + LatticeFermion src (FGrid); random(RNG5,src); + LatticeFermion ref (FGrid); + LatticeFermion tmp (FGrid); + + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + { + LatticeGaugeField Umu5d(FGrid); + std::vector U(4,FGrid); + for(int ss=0;ssoSites();ss++){ + for(int s=0;s(Umu5d,mu); + } + for(int mu=0;muBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops({2,2,2,2}); + + Benchmark::Decomposition(); + + int do_memory=1; + int do_comms =1; + int do_su3 =0; + int do_wilson=1; + int do_dwf =1; + + if ( do_memory ) { + std::cout< Date: Sat, 19 Aug 2017 23:11:30 +0100 Subject: [PATCH 151/170] Update TODO --- TODO | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/TODO b/TODO index 001c6c0c..cccc5f45 100644 --- a/TODO +++ b/TODO @@ -2,18 +2,18 @@ TODO: --------------- Large item work list: -1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O +1)- BG/Q port and check 2)- 
Christoph's local basis expansion Lanczos -3)- BG/Q port and check -4)- Precision conversion and sort out localConvert <-- partial +3)- Precision conversion and sort out localConvert <-- partial - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet -5)- Physical propagator interface -6)- Conserved currents -7)- Multigrid Wilson and DWF, compare to other Multigrid implementations -8)- HDCR resume +4)- Physical propagator interface +5)- Conserved currents +6)- Multigrid Wilson and DWF, compare to other Multigrid implementations +7)- HDCR resume Recent DONE +-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE -- GaugeFix into central location <-- DONE -- Scidac and Ildg metadata handling <-- DONE From a446d95c3393d697f987434ac594950d18017b7a Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 01:10:50 +0100 Subject: [PATCH 152/170] Trying to pass TeamCity and Travis --- benchmarks/Benchmark_ITT.cc | 12 ++++++------ lib/communicator/Communicator_base.cc | 6 +++++- lib/communicator/Communicator_base.h | 19 +++++++++++++------ lib/communicator/Communicator_mpi3.cc | 17 +++++++++++++---- lib/qcd/action/fermion/WilsonFermion5D.cc | 18 ++++++++++-------- lib/stencil/Stencil.h | 7 ++++++- lib/util/Init.cc | 18 ++++++++++++++---- 7 files changed, 67 insertions(+), 30 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 4f16b1de..9bf7d0a5 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -218,7 +218,7 @@ public: std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -368,7 +368,7 @@ public: const int num_cases = 4; #endif controls Cases [] = { -#if defined(AVX512) +#ifdef AVX512 { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif @@ -380,6 +380,10 @@ public: for(int c=0;cBarrier(); diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 6767495f..3ce3a774 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -41,6 +41,7 @@ uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; int CartesianCommunicator::nCommThreads = -1; +int CartesianCommunicator::Hugepages = 0; ///////////////////////////////// // Alloc, free shmem region @@ -134,7 +135,10 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { } void CartesianCommunicator::ShmInitGeneric(void){ #if 1 - ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; + if ( Hugepages ) mmap_flag |= MAP_HUGETLB; + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); std::cout << "ShmCommBuf "< #ifdef HAVE_NUMAIF_H #include #endif + +// Make up for linex deficiencies #ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 +#define SHM_HUGETLB 0x0 +#endif +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x0 #endif namespace Grid { @@ -213,8 +218,11 @@ void 
CartesianCommunicator::Init(int *argc, char ***argv) { int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); if ( fd < 0 ) { perror("failed shm_open"); assert(0); } ftruncate(fd, size); + + int mmap_flag = MAP_SHARED; + if (Hugepages) mmap_flag |= MAP_HUGETLB; + void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); @@ -628,8 +636,9 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); StencilSendToRecvFromComplete(list,dir); + return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -671,7 +680,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list); + this->StencilSendToRecvFromComplete(list,dir); } return off_node_bytes; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 0b6c9e3d..404ecce0 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -135,10 +135,11 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, template void WilsonFermion5D::Report(void) { - std::vector latt = GridDefaultLatt(); - RealD volume = Ls; for(int mu=0;mu_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + std::vector latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; @@ -390,17 +391,18 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - // Rely on async comms; start comms before merge of local data double ctime=0; double ptime=0; - // DhopComputeTime-=usecond(); - // DhopCommTime-=usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. 
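  // (Sketch of the mapping, inferred from the code below: with nthreads OpenMP
  //  threads and ncomms = CartesianCommunicator::nCommThreads, the threads with
  //  tid < ncomms drive the MPI packets while the remaining nthreads - ncomms
  //  threads run the stencil compute, all inside one parallel region; the
  //  fallback for nCommThreads == -1 becomes a single comms thread instead of
  //  one thread per packet.)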
+ ////////////////////////////////////////////////////////////////////////////////////////////////////// #pragma omp parallel reduction(max:ctime) reduction(max:ptime) { int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = st.Packets.size(); + if (ncomms == -1) ncomms = 1; assert(nthreads > ncomms); if (tid >= ncomms) { double start = usecond(); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index d1d7a7e0..cca67587 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -252,10 +252,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// void CommunicateThreaded() { +#ifdef GRID_OMP // must be called in parallel region int mythread = omp_get_thread_num(); int nthreads = CartesianCommunicator::nCommThreads; - if (nthreads == -1) nthreads = Packets.size(); +#else + int mythread = 0; + int nthreads = 1; +#endif + if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { for (int i = mythread; i < Packets.size(); i += nthreads) { double start = usecond(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 39a726cf..3fd8b4cd 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv) CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; } + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ + CartesianCommunicator::Hugepages = 1; + } + + if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } @@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv) std::cout< Date: Sun, 20 Aug 2017 01:27:48 +0100 Subject: [PATCH 153/170] Switch off comms for now until feature/multi-communicator is merged --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 4f16b1de..91524149 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -160,7 +160,7 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } -#if 1 +#if 0 tbytes= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[dir][0], xmit_to_rank, From 11062fb6861153ffafa6d821f8ee53f01f5f72a4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 01:37:07 +0100 Subject: [PATCH 154/170] Comms none fail fix --- lib/communicator/Communicator_base.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 3ce3a774..2e6626be 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -102,6 +102,18 @@ int CartesianCommunicator::NodeCount(void) { return Proc int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; #endif #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) +{ + std::vector list; + // Discard the "dir" + SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); + SendToRecvFromComplete(list); + return 2.0*bytes; +} double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank, From 1cdf99966810227f180452393973c87ae4a301c4 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 20 Aug 2017 02:39:10 +0100 Subject: [PATCH 
155/170] Moving multicommunicator into mpi3 also for threading --- lib/communicator/Communicator_base.h | 8 ++++---- lib/communicator/Communicator_mpi3.cc | 12 ++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index ac7d94f3..ac866ced 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -78,15 +78,15 @@ class CartesianCommunicator { #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) static MPI_Comm communicator_world; - MPI_Comm communicator; + + MPI_Comm communicator; + std::vector communicator_halo; + typedef MPI_Request CommsRequest_t; #else typedef int CommsRequest_t; #endif -#if defined (GRID_COMMS_MPIT) - std::vector communicator_halo; -#endif //////////////////////////////////////////////////////////////////// // Helper functionality for SHM Windows common to all other impls diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 4f769971..9e5dfb97 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -405,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { int ierr; communicator=communicator_world; + _ndimension = processors.size(); + communicator_halo.resize (2*_ndimension); + for(int i=0;i<_ndimension*2;i++){ + MPI_Comm_dup(communicator,&communicator_halo[i]); + } + //////////////////////////////////////////////////////////////// // Assert power of two shm_size. //////////////////////////////////////////////////////////////// @@ -648,6 +654,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sun, 20 Aug 2017 02:53:12 +0100 Subject: [PATCH 156/170] finalise issue on new OPA revert --- benchmarks/Benchmark_dwf.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 98ce0a07..3858226e 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -503,9 +503,9 @@ int main (int argc, char ** argv) std::cout< Date: Sun, 20 Aug 2017 03:08:54 +0100 Subject: [PATCH 157/170] MAP_HUGETLB portability fix --- lib/communicator/Communicator_base.cc | 2 ++ lib/communicator/Communicator_mpi3.cc | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 2e6626be..3378c56a 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -149,7 +149,9 @@ void CartesianCommunicator::ShmInitGeneric(void){ #if 1 int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; +#ifdef MAP_HUGETLB if ( Hugepages ) mmap_flag |= MAP_HUGETLB; +#endif ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); std::cout << "ShmCommBuf "< #include #endif -// Make up for linex deficiencies -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 0x0 -#endif -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x0 -#endif namespace Grid { @@ -220,7 +213,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ftruncate(fd, size); int mmap_flag = MAP_SHARED; +#ifdef MAP_HUGETLB if (Hugepages) mmap_flag |= MAP_HUGETLB; +#endif void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } @@ -274,7 +269,11 @@ void 
CartesianCommunicator::Init(int *argc, char ***argv) { for(int r=0;r Date: Wed, 23 Aug 2017 15:07:18 +0100 Subject: [PATCH 158/170] Staggered multinode block cg debugged. Missing global sum. Code stalls and resumes on KNL at cambridge. Curious. CG iterations 23ms each, then 3200 ms pauses. Mean bandwidth reports as 200MB/s. Comms dominant in the report. However, the time behaviour suggests it is *bursty*.... Could be swap to disk? --- .../iterative/BlockConjugateGradient.h | 9 ++- lib/lattice/Lattice_reduction.h | 38 ++++++++---- .../fermion/ImprovedStaggeredFermion5D.cc | 60 +++++++++++++++++++ .../fermion/ImprovedStaggeredFermion5D.h | 10 ++++ .../solver/Test_staggered_block_cg_unprec.cc | 8 ++- 5 files changed, 110 insertions(+), 15 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 9418f63c..d7817c05 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) Linop.HermOp(X, AD); tmp = B - AD; + //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl; ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); + //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl; + //std::cout << GridLogMessage << " m_rr " << m_rr< &Linop, const Field &B, Field &X) MatrixTimer.Start(); Linop.HermOp(D, Z); MatrixTimer.Stop(); + //std::cout << GridLogMessage << " norm2 Z " < &R,std::vector &a,const Lattice } }; +/* inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) { int NN = BlockSolverGrid->_ndimension; @@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or } return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } +*/ template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) @@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); + // Lattice Xslice(SliceGrid); + // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // Lattice Xslice(SliceGrid); + // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl=1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_type vector_type; GridBase *FullGrid = lhs._grid; - GridBase 
*SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); int Nblock = FullGrid->GlobalDimensions()[Orthog]; - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); + // Lattice Lslice(SliceGrid); + // Lattice Rslice(SliceGrid); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; - int nl = SliceGrid->_ndimension; + // int nl = SliceGrid->_ndimension; + int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -550,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat += mat_thread; } } + + for(int i=0;iGlobalSum(sum); + mat(i,j)=sum; + }} + return; } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 61a3c559..7d988d89 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr { Compressor compressor; int LLs = in._grid->_rdimensions[0]; + + + + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { @@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); } } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); } template void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=2; conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,out._grid); @@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); } +template +void ImprovedStaggeredFermion5D::Report(void) +{ + std::vector latt = GridDefaultLatt(); + RealD volume = Ls; for(int mu=0;mu_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << 
std::endl; + + // Average the compute time + _FourDimGrid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" < +void ImprovedStaggeredFermion5D::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} ///////////////////////////////////////////////////////////////////////// // Implement the general interface. Here we use SAME mass on all slices diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 4961da49..ca1a955a 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -55,6 +55,16 @@ namespace QCD { FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 8db41e98..f54bc3b2 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -75,7 +75,7 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -99,21 +99,27 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Calling 5d CG for "< Date: Thu, 24 Aug 2017 10:17:52 +0100 Subject: [PATCH 159/170] FFT test compile fixed --- lib/qcd/utils/GaugeFix.h | 3 +++ tests/core/Test_fft_gfix.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h index f2ea1aa2..c4ea31aa 100644 --- a/lib/qcd/utils/GaugeFix.h +++ b/lib/qcd/utils/GaugeFix.h @@ -26,6 +26,8 @@ Author: Peter Boyle /* END LEGAL */ //#include +#ifndef GRID_QCD_GAUGE_FIX_H +#define GRID_QCD_GAUGE_FIX_H namespace Grid { namespace QCD { @@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer : public Gimpl { } } +#endif diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 9732eb85..916c4b0b 100644 --- 
a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -28,6 +28,9 @@ Author: Peter Boyle /* END LEGAL */ #include +using namespace Grid; +using namespace Grid::QCD; + int main (int argc, char ** argv) { std::vector seeds({1,2,3,4}); @@ -82,6 +85,7 @@ int main (int argc, char ** argv) Uorg = Uorg - Umu; std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; + std::cout << " Norm "<< norm2(Umu) << std::endl; std::cout<< "*****************************************************************" < Date: Thu, 24 Aug 2017 17:31:44 +0100 Subject: [PATCH 160/170] Fix bug in non-zero momentum projection --- extras/Hadrons/Modules/MScalar/ChargedProp.cc | 25 ++++++++-- extras/Hadrons/Modules/MScalar/ScalarVP.cc | 50 +++++++++++++++---- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc index 285b237a..1b901bf1 100644 --- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc +++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc @@ -194,7 +194,10 @@ void TChargedProp::execute(void) buf = prop; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - buf = buf*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + buf = buf*adj(*phase_[j]); + } } sliceSum(buf, vecBuf, Tp); result.resize(vecBuf.size()); @@ -208,7 +211,10 @@ void TChargedProp::execute(void) buf = *prop0_; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - buf = buf*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + buf = buf*adj(*phase_[j]); + } } sliceSum(buf, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) @@ -221,7 +227,10 @@ void TChargedProp::execute(void) buf = propQ; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - buf = buf*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + buf = buf*adj(*phase_[j]); + } } sliceSum(buf, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) @@ -234,7 +243,10 @@ void TChargedProp::execute(void) buf = propSun; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - buf = buf*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + buf = buf*adj(*phase_[j]); + } } sliceSum(buf, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) @@ -247,7 +259,10 @@ void TChargedProp::execute(void) buf = propTad; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - buf = buf*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + buf = buf*adj(*phase_[j]); + } } sliceSum(buf, vecBuf, Tp); for (unsigned int t = 0; t < vecBuf.size(); ++t) diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc index e4f4e820..4d923802 100644 --- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc +++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc @@ -214,7 +214,10 @@ void TScalarVP::execute(void) vpPhase = freeVpTensor[mu][nu]; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -248,7 +251,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; 
++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -282,7 +288,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -314,7 +323,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -345,7 +357,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -377,7 +392,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -408,7 +426,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -443,7 +464,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -477,7 +501,10 @@ void TScalarVP::execute(void) vpPhase = tmp_vp; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); @@ -501,7 +528,10 @@ void TScalarVP::execute(void) vpPhase = vpTensor[mu][nu]; for (unsigned int j = 0; j < env().getNd()-1; ++j) { - vpPhase = vpPhase*pow(adj(*phase_[j]), mom[j]); + for (unsigned int momcount = 0; momcount < mom[j]; ++momcount) + { + vpPhase = vpPhase*adj(*phase_[j]); + } } sliceSum(vpPhase, vecBuf, Tp); result.resize(vecBuf.size()); From 102ea9ae668a1c8eef506113348355e9d78fd522 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 24 Aug 2017 18:17:09 +0100 Subject: [PATCH 161/170] CI update --- .travis.yml | 68 ----------------------------------------------------- README.md | 16 +------------ 2 files changed, 1 insertion(+), 83 deletions(-) diff --git a/.travis.yml b/.travis.yml index 64dae823..7d8203ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,68 +9,6 @@ matrix: - os: osx osx_image: xcode8.3 compiler: clang - - compiler: gcc - dist: trusty - sudo: required - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.9 - - libmpfr-dev - - libgmp-dev - - 
libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: VERSION=-4.9 - - compiler: gcc - dist: trusty - sudo: required - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-5 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: VERSION=-5 - - compiler: clang - dist: trusty - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.8 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz - - compiler: clang - dist: trusty - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.8 - - libmpfr-dev - - libgmp-dev - - libmpc-dev - - libopenmpi-dev - - openmpi-bin - - binutils-dev - env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz before_install: - export GRIDDIR=`pwd` @@ -106,9 +44,3 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check - - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi - - diff --git a/README.md b/README.md index 1e0988f3..13dd6996 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,4 @@ -# Grid - - - - - - - - - -
-Last stable release
-Development branch
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) **Data parallel C++ mathematical object library.** From c3b1263e75212356fc1aa061cd226db70f4f00fc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Aug 2017 09:25:54 +0100 Subject: [PATCH 162/170] Benchmark prep --- benchmarks/Benchmark_ITT.cc | 322 +++++++++++++++++++--- benchmarks/Benchmark_comms.cc | 30 +- lib/allocator/AlignedAllocator.h | 5 + lib/communicator/Communicator_base.cc | 6 +- lib/communicator/Communicator_mpi3.cc | 5 +- lib/communicator/Communicator_mpit.cc | 19 +- lib/qcd/action/fermion/CayleyFermion5D.cc | 12 +- lib/qcd/action/fermion/WilsonCompressor.h | 41 ++- lib/qcd/action/fermion/WilsonFermion5D.cc | 11 + lib/stencil/Stencil.h | 114 +++++++- 10 files changed, 494 insertions(+), 71 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 9bf7d0a5..c5226ee1 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -32,6 +32,19 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; +typedef WilsonFermion5D WilsonFermion5DR; +typedef WilsonFermion5D WilsonFermion5DF; +typedef WilsonFermion5D WilsonFermion5DD; + + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; struct time_statistics{ double mean; @@ -95,13 +108,15 @@ public: static void Comms(void) { - int Nloop=100; + int Nloop=1000; int nmu=0; int maxlat=32; std::vector simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); + for(int mu=0;mu1) nmu++; + std::vector t_time(Nloop); time_statistics timestat; @@ -133,13 +148,14 @@ public: bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } - int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + int ncomm; double dbytes; + std::vector times(Nloop); for(int i=0;i requests; dbytes=0; ncomm=0; @@ -150,7 +166,6 @@ public: if (mpi_layout[mu]>1 ) { - ncomm++; int xmit_to_rank; int recv_from_rank; if ( dir == mu ) { @@ -160,18 +175,18 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } -#if 1 - tbytes= Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[dir][0], - xmit_to_rank, - (void *)&rbuf[dir][0], - recv_from_rank, - bytes,dir); - Grid.StencilSendToRecvFromComplete(requests,dir); -#endif - requests.resize(0); - + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes,dir); + +#ifdef GRID_OMP #pragma omp atomic +#endif + ncomm++; + +#ifdef GRID_OMP +#pragma omp atomic +#endif dbytes+=tbytes; } } @@ -181,13 +196,15 @@ public: } timestat.statistics(t_time); + // for(int i=0;i({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -253,8 +271,7 @@ public: } }; - - static void DWF(int Ls,int L) + static double DWF5(int Ls,int L) { RealD mass=0.1; RealD M5 =1.8; @@ -262,6 +279,7 @@ public: double mflops; double mflops_best = 0; double mflops_worst= 0; + std::vector mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size @@ -274,6 +292,189 @@ public: GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); uint64_t NP = 
TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + std::vector internal; + if ( SHM == 1 ) internal = std::vector({1,1,1,1}); + else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); + else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); + else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); + else assert(0); + + std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); + std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + ///////// Source preparation //////////// + LatticeFermion src (sFGrid); random(RNG5,src); + LatticeFermion tmp (sFGrid); + + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + + WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); + LatticeFermion src_e (sFrbGrid); + LatticeFermion src_o (sFrbGrid); + LatticeFermion r_e (sFrbGrid); + LatticeFermion r_o (sFrbGrid); + LatticeFermion r_eo (sFGrid); + LatticeFermion err (sFGrid); + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + +#if defined(AVX512) + const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); +#else + const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); +#endif + controls Cases [] = { +#ifdef AVX512 + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, +#endif + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 1000; + + sFGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4); + std::vector local({L,L,L,L}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), + GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); 
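      // (Aside, a sketch of the decomposition set up here, assuming the ranks
      //  per node is a power of two up to 8: NP is the total rank count and NN
      //  the node count, so SHM = NP/NN ranks share each node; the global
      //  layout then factors as mpi[d] = internal[d] * nodes[d], and
      //  latt4[d] = local[d] * nodes[d] is the lattice spanned by the node grid.)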
+ NN_global=NN; uint64_t SHM=NP/NN; std::vector internal; @@ -364,13 +565,15 @@ public: #if defined(AVX512) const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); #else const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); #endif controls Cases [] = { #ifdef AVX512 - { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, @@ -394,7 +597,7 @@ public: if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<Barrier(); for(int i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 1000; + FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<mflops_best ) mflops_best = mflops; @@ -450,12 +656,20 @@ public: } std::cout< L_list({8,12,16,24}); + std::vector wilson; + std::vector dwf4; + std::vector dwf5; + if ( do_wilson ) { int Ls=1; std::cout< > xbuf(8,Vector(lat*lat*lat*Ls)); - Vector > rbuf(8,Vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < > xbuf(8,Vector(lat*lat*lat*Ls)); - Vector > rbuf(8,Vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < &waitall,int dir) { - // Do nothing + int nreq=waitall.size(); + MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); }; double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, int xmit_to_rank, @@ -262,7 +275,7 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, // Give the CPU to MPI immediately; can use threads to overlap optionally MPI_Request req[2]; MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); - MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank, communicator_halo[dir], &req[0]); + MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]); MPI_Waitall(2, req, MPI_STATUSES_IGNORE); return 2.0*bytes; } diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 46ba3793..5e67d1f1 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -429,7 +429,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorM5) +1.0); - // assert(fabs(bee[i])>0.0); + assert(fabs(bee[i])>0.0); 
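      // (Note on the assert above: bee[i] is divided through repeatedly in the
      //  LDU factorisation that follows, so a vanishing coefficient must fail
      //  fast here; patch 166 below rewrites these checks as
      //  bee[i] != Coeff_t(0.0), plausibly because fabs() of a complex Coeff_t
      //  does not compile under Clang.)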
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); beo[i]=as[i]*bs[i]; ceo[i]=-as[i]*cs[i]; @@ -455,11 +455,17 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(fabs(bee[0])>0.0); lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column leem[i]=mass*cee[Ls-1]/bee[0]; - for(int j=0;j0.0); + leem[i]*= aee[j]/bee[j+1]; + } uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row @@ -478,7 +484,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(fabs(bee[j])>0.0); delta_d *= cee[j]/bee[j]; } dee[Ls-1] += delta_d; diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 96cbe1ec..30c6d838 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -238,7 +238,35 @@ template using WilsonCompressor = WilsonCom template class WilsonStencil : public CartesianStencil { public: - + double timer0; + double timer1; + double timer2; + double timer3; + double timer4; + double timer5; + double timer6; + uint64_t callsi; + void ZeroCountersi(void) + { + std::cout << GridLogMessage << " ZeroCountersi()"< same_node; @@ -252,6 +280,7 @@ public: : CartesianStencil (grid,npoints,checkerboard,directions,distances) , same_node(npoints) { + ZeroCountersi(); surface_list.resize(0); }; @@ -282,17 +311,25 @@ public: { std::vector > reqs; this->HaloExchangeOptGather(source,compress); + double t1=usecond(); this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); + double t2=usecond(); timer1 += t2-t1; this->CommsMerge(compress); + double t3=usecond(); timer2 += t3-t2; this->CommsMergeSHM(compress); + double t4=usecond(); timer3 += t4-t3; } template void HaloExchangeOptGather(const Lattice &source,compressor &compress) { this->Prepare(); + double t0=usecond(); this->HaloGatherOpt(source,compress); + double t1=usecond(); + timer0 += t1-t0; + callsi++; } template @@ -304,7 +341,9 @@ public: typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + this->mpi3synctime_g-=usecond(); this->_grid->StencilBarrier(); + this->mpi3synctime_g+=usecond(); assert(source._grid==this->_grid); this->halogtime-=usecond(); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 404ecce0..c5b0f872 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -185,6 +185,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "WilsonFermion5D StencilEven"< 0){ + std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" < @@ -204,6 +209,9 @@ void WilsonFermion5D::ZeroCounters(void) { Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); } @@ -445,6 +453,9 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopCommTime += ctime; DhopComputeTime+=ptime; + // First to enter, last to leave timing + st.CollateThreads(); + DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index cca67587..ad454bcb 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal // Timing info; ugly; possibly temporary ///////////////////////////////////////// double commtime; + double 
mpi3synctime; + double mpi3synctime_g; + double shmmergetime; double gathertime; double gathermtime; double halogtime; @@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; - std::vector comms_bytesthr; - std::vector commtimethr; + std::vector comm_bytes_thr; + std::vector comm_time_thr; + std::vector comm_enter_thr; + std::vector comm_leave_thr; //////////////////////////////////////// // Stencil query @@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal #endif if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { + comm_enter_thr[mythread] = usecond(); for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes,i); - comms_bytesthr[mythread] += bytes; - commtimethr[mythread] += usecond() - start; + comm_bytes_thr[mythread] += bytes; } + comm_leave_thr[mythread]= usecond(); + comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } + + void CollateThreads(void) + { + int nthreads = CartesianCommunicator::nCommThreads; + double first=0.0; + double last =0.0; + + for(int t=0;t 0.0) && ( t0 < first ) ) first = t0; // min time seen + + if ( t1 > last ) last = t1; // max time seen + + } + commtime+= last-first; + } void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); @@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } commtime+=usecond(); } + void Communicate(void) + { +#ifdef GRID_OMP +#pragma omp parallel + { + // must be called in parallel region + int mythread = omp_get_thread_num(); + int maxthreads= omp_get_max_threads(); + int nthreads = CartesianCommunicator::nCommThreads; + assert(nthreads <= maxthreads); + + if (nthreads == -1) nthreads = 1; +#else + int mythread = 0; + int nthreads = 1; +#endif + if (mythread < nthreads) { + for (int i = mythread; i < Packets.size(); i += nthreads) { + double start = usecond(); + comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comm_time_thr[mythread] += usecond() - start; + } + } +#ifdef GRID_OMP + } +#endif + } template void HaloExchange(const Lattice &source,compressor &compress) { std::vector > reqs; Prepare(); HaloGather(source,compress); + // Concurrent CommunicateBegin(reqs); CommunicateComplete(reqs); + // Sequential + // Communicate(); CommsMergeSHM(compress); CommsMerge(compress); } @@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal template void HaloGather(const Lattice &source,compressor &compress) { + mpi3synctime_g-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime_g+=usecond(); // conformable(source._grid,_grid); assert(source._grid==_grid); @@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); + shmmergetime+=usecond(); } template @@ -470,8 +542,10 @@ 
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal const std::vector &distances) : _permute_type(npoints), _comm_buf_size(npoints), - comms_bytesthr(npoints), - commtimethr(npoints) + comm_bytes_thr(npoints), + comm_enter_thr(npoints), + comm_leave_thr(npoints), + comm_time_thr(npoints) { face_table_computed=0; _npoints = npoints; @@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal void ZeroCounters(void) { gathertime = 0.; commtime = 0.; - memset(&commtimethr[0], 0, sizeof(commtimethr)); - memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr)); + mpi3synctime=0.; + mpi3synctime_g=0.; + shmmergetime=0.; + for(int i=0;i<_npoints;i++){ + comm_time_thr[i]=0; + comm_bytes_thr[i]=0; + comm_enter_thr[i]=0; + comm_leave_thr[i]=0; + } halogtime = 0.; mergetime = 0.; decompresstime = 0.; @@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal RealD NP = _grid->_Nprocessors; RealD NN = _grid->NodeCount(); double t = 0; - // if commtimethr is set they were all done in parallel so take the max + // if comm_time_thr is set they were all done in parallel so take the max // but add up the bytes + int threaded = 0 ; for (int i = 0; i < 8; ++i) { - comms_bytes += comms_bytesthr[i]; - if (t < commtimethr[i]) t = commtimethr[i]; + if ( comm_time_thr[i]>0.0 ) { + threaded = 1; + comms_bytes += comm_bytes_thr[i]; + if (t < comm_time_thr[i]) t = comm_time_thr[i]; + } } - commtime += t; + if (threaded) commtime += t; _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { @@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< Date: Fri, 25 Aug 2017 11:41:01 +0100 Subject: [PATCH 163/170] updated from cambridge mpi3 shakeout --- benchmarks/Benchmark_ITT.cc | 4 ++-- lib/qcd/action/fermion/WilsonCompressor.h | 7 +++++-- lib/stencil/Stencil.h | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index c5226ee1..bd75dd8e 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -108,7 +108,7 @@ public: static void Comms(void) { - int Nloop=1000; + int Nloop=200; int nmu=0; int maxlat=32; @@ -197,7 +197,7 @@ public: timestat.statistics(t_time); // for(int i=0;i > reqs; this->HaloExchangeOptGather(source,compress); double t1=usecond(); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); + // Asynchronous MPI calls multidirectional, Isend etc... + // this->CommunicateBegin(reqs); + // this->CommunicateComplete(reqs); + // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. 
+ this->Communicate(); double t2=usecond(); timer1 += t2-t1; this->CommsMerge(compress); double t3=usecond(); timer2 += t3-t2; diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index ad454bcb..cd0792d5 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -365,10 +365,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Prepare(); HaloGather(source,compress); // Concurrent - CommunicateBegin(reqs); - CommunicateComplete(reqs); - // Sequential - // Communicate(); + //CommunicateBegin(reqs); + //CommunicateComplete(reqs); + // Sequential, possibly threaded + Communicate(); CommsMergeSHM(compress); CommsMerge(compress); } From 3a582174053732f4e5645367b750fd446d8fcb1d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Aug 2017 14:29:53 +0100 Subject: [PATCH 164/170] Updated --- benchmarks/Benchmark_ITT.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index bd75dd8e..2edae8d0 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -386,7 +386,7 @@ public: if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<Barrier(); for(int i=0;iBroadcast(0,&ncall,sizeof(ncall)); From d0f3d525d5dfb6cd7a2f5fe3be5a69c7ddc1306e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Aug 2017 19:33:54 +0100 Subject: [PATCH 165/170] Optimal block size for KNL --- benchmarks/Benchmark_ITT.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2edae8d0..c0ce451f 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -679,8 +679,11 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); +#ifdef KNL + LebesgueOrder::Block = std::vector({8,2,2,2}); +#else LebesgueOrder::Block = std::vector({2,2,2,2}); - +#endif Benchmark::Decomposition(); int do_memory=1; From f68b5de9c8798779ef2657b9c2d469174ae8f53a Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 25 Aug 2017 19:35:21 +0100 Subject: [PATCH 166/170] No compile fix on Clang --- lib/qcd/action/fermion/CayleyFermion5D.cc | 12 ++++++------ lib/qcd/action/fermion/WilsonCompressor.h | 4 ---- lib/qcd/action/fermion/WilsonFermion5D.cc | 5 +++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 5e67d1f1..838b1c3d 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -414,7 +414,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(omega[i]!=Coeff_t(0.0)); bs[i] = 0.5*(bpc/omega[i] + bmc); cs[i] = 0.5*(bpc/omega[i] - bmc); } @@ -429,7 +429,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorM5) +1.0); - assert(fabs(bee[i])>0.0); + assert(bee[i]!=Coeff_t(0.0)); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); beo[i]=as[i]*bs[i]; ceo[i]=-as[i]*cs[i]; @@ -456,14 +456,14 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); - assert(fabs(bee[0])>0.0); + assert(bee[i]!=Coeff_t(0.0)); + assert(bee[0]!=Coeff_t(0.0)); lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column leem[i]=mass*cee[Ls-1]/bee[0]; for(int j=0;j0.0); + assert(bee[j+1]!=Coeff_t(0.0)); leem[i]*= aee[j]/bee[j+1]; } @@ -484,7 +484,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD 
zolo_hi,std::vector0.0); + assert(bee[j] != Coeff_t(0.0)); delta_d *= cee[j]/bee[j]; } dee[Ls-1] += delta_d; diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 406476b0..cc5c3c63 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -248,7 +248,6 @@ public: uint64_t callsi; void ZeroCountersi(void) { - std::cout << GridLogMessage << " ZeroCountersi()"<_npoints;point++){ same_node[point] = this->SameNode(point); - // std::cout << " dir " <HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index c5b0f872..1da58ddb 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -123,12 +123,13 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, int vol4; vol4=FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); + vol4=FourDimRedBlackGrid.oSites(); StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); - std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() - <<" " << StencilEven.surface_list.size()< Date: Fri, 25 Aug 2017 20:43:37 +0100 Subject: [PATCH 167/170] Fix --- benchmarks/Benchmark_ITT.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 58fdb84a..c0ce451f 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -181,6 +181,7 @@ public: #ifdef GRID_OMP #pragma omp atomic +#endif ncomm++; #ifdef GRID_OMP From 54a5e6c1d0ec1cf1b66dac5ba407db49bc7e1016 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 25 Aug 2017 22:36:08 +0100 Subject: [PATCH 168/170] Check if we get huge pages on linux. Larry Meadows piece of magic. 
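(Background for the check below, a sketch assuming x86-64 Linux with 4 KiB
base pages and 2 MiB huge pages: each 8-byte record of /proc/self/pagemap
holds the physical page-frame number of one virtual page in bits 0-54, so a
buffer sits in huge pages exactly when every run of 512 consecutive virtual
pages maps onto 512 consecutive physical frames. Decoding a single record:

    uint64_t entry;                               // one record read from /proc/self/pagemap
    uint64_t pfn  = entry & 0x7fffffffffffffULL;  // bits 0-54: page-frame number
    uint64_t phys = pfn * 4096;                   // physical address of that 4 KiB page

The code below applies this 512-page contiguity test to every 2 MiB chunk of
the buffer and reports how many 4 KiB pages fall outside huge pages.)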
--- lib/allocator/AlignedAllocator.cc | 33 +++++++++++++++++++++++++++++++ lib/allocator/AlignedAllocator.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc index 04de20bf..764bd732 100644 --- a/lib/allocator/AlignedAllocator.cc +++ b/lib/allocator/AlignedAllocator.cc @@ -63,4 +63,37 @@ void *PointerCache::Lookup(size_t bytes) { return NULL; } + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + } diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index c5ad0883..e64a5949 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -64,6 +64,8 @@ namespace Grid { }; + void check_huge_pages(void *Buf,uint64_t BYTES); + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. 
////////////////////////////////////////////////////////////////////

From 4b4c2a715b319bcc7060ef9ae8aa983c49471167 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Sat, 26 Aug 2017 11:38:04 +0100
Subject: [PATCH 169/170] fcntl.h needed

---
 lib/allocator/AlignedAllocator.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc
index 764bd732..967b2571 100644
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -1,7 +1,5 @@
-
-
-
 #include <Grid/GridCore.h>
+#include <fcntl.h>
 
 namespace Grid {

From 91676d1dda82e0e4779dbe64f0605be3db102142 Mon Sep 17 00:00:00 2001
From: James Harrison
Date: Fri, 1 Sep 2017 15:48:30 +0100
Subject: [PATCH 170/170] Fix “MAP_ANONYMOUS undefined” error on OSX.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/communicator/Communicator_base.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 956de0d2..ba9de1cc 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -148,6 +148,11 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
+#if !defined(MAP_ANONYMOUS)
+  #define NO_MAP_ANONYMOUS
+  #define MAP_ANONYMOUS MAP_ANON
+#endif
+
   int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
 #ifdef MAP_HUGETLB
   if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
@@ -162,6 +167,11 @@ void CartesianCommunicator::ShmInitGeneric(void){
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
 #endif
   bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
+
+#if defined(NO_MAP_ANONYMOUS)
+  #undef MAP_ANONYMOUS
+  #undef NO_MAP_ANONYMOUS
+#endif
 }
 #endif
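(Taken together, patches 152, 157 and 170 leave the generic shared-memory
allocation looking roughly like the sketch below; AllocShm is a made-up
wrapper name for illustration, not the verbatim ShmInitGeneric:

    #include <sys/mman.h>
    #include <cstdlib>
    #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
    #define MAP_ANONYMOUS MAP_ANON     // Darwin spelling of the flag
    #endif

    void *AllocShm(size_t bytes, int Hugepages) {
      int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
    #ifdef MAP_HUGETLB
      if ( Hugepages ) mmap_flag |= MAP_HUGETLB;  // opted in with --shm-hugepages
    #endif
      void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
      if ( buf == MAP_FAILED ) exit(EXIT_FAILURE);
      return buf;
    }

Platforms without MAP_HUGETLB still compile, OSX resolves MAP_ANONYMOUS via
MAP_ANON, and huge pages remain an explicit run-time opt-in on Linux.)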