From 83f6fab8fa2b194d77d4fb23badef5fecbea74cd Mon Sep 17 00:00:00 2001
From: paboyle
Date: Sun, 2 Apr 2017 12:10:51 +0900
Subject: [PATCH] Big/Small crush test, and fast SITMO rng init, faster but not ideal MT and Ranlux init.

---
 lib/lattice/Lattice_rng.h        | 175 ++++++++++++++++++-------------
 tests/testu01/Test_smallcrush.cc |  42 +++-----
 2 files changed, 117 insertions(+), 100 deletions(-)

diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index 31fef729..ddd2170f 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -30,12 +30,19 @@ Author: paboyle
 #define GRID_LATTICE_RNG_H
 
 #include <random>
+
+#ifdef RNG_SITMO
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#endif
+
+#if defined(RNG_SITMO)
+#define RNG_FAST_DISCARD
+#else
+#undef RNG_FAST_DISCARD
+#endif
 
 namespace Grid {
 
-  //http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf ?
-
   //////////////////////////////////////////////////////////////
   // Allow the RNG state to be less dense than the fine grid
   //////////////////////////////////////////////////////////////
@@ -65,20 +72,22 @@ namespace Grid {
       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
     }
-
     return multiplicity;
   }
 
   // real scalars are one component
-  template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
+  template<class scalar,class distribution,class generator>
+  void fillScalar(scalar &s,distribution &dist,generator & gen)
   {
     s=dist(gen);
   }
-  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+  template<class distribution,class generator>
+  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
   {
     s=ComplexF(dist(gen),dist(gen));
   }
-  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+  template<class distribution,class generator>
+  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
   {
     s=ComplexD(dist(gen),dist(gen));
   }
@@ -102,27 +111,6 @@ namespace Grid {
       typedef uint64_t RngStateType;
       static const int RngStateCount = 4;
 #endif
-      ///////////////////////
-      // support for parallel init
-      ///////////////////////
-#ifdef RNG_SITMO
-      static void Skip(RngEngine &eng)
-      {
-        uint64_t skip = 0x1; skip = skip<<40;
-        eng.discard(skip);
-      }
-#endif
-      static RngEngine Reseed(RngEngine &eng)
-      {
-        const int reseeds=4;
-        std::uniform_int_distribution<uint32_t> uid;
-        std::vector<uint32_t> newseed(reseeds);
-        for(int i=0;i<reseeds;i++){
-          newseed[i] = uid(eng);
-        }
-        std::seed_seq sseq(newseed.begin(),newseed.end());
-        return RngEngine(sseq);
-      }
 
       std::vector<RngEngine>                               _generators;
       std::vector<std::uniform_real_distribution<RealD>>   _uniform;
@@ -130,6 +118,46 @@ namespace Grid {
       std::vector<std::discrete_distribution<int32_t>>     _bernoulli;
      std::vector<std::uniform_int_distribution<uint32_t>> _uid;
 
+      ///////////////////////
+      // support for parallel init
+      ///////////////////////
+#ifdef RNG_FAST_DISCARD
+      static void Skip(RngEngine &eng)
+      {
+        /////////////////////////////////////////////////////////////////////////////////////
+        // Skip by 2^40 elements between successive lattice sites
+        // That is of order 10^12.
+        // Consider quenched updating; likely never exceeding rate of 1000 sweeps
+        // per second on any machine. This gives us of order 10^9 seconds, or 100 years
+        // skip ahead.
+        // For HMC unlikely to go at faster than a solve per second, and
+        // tens of seconds per trajectory so this is clean in all reasonable cases,
+        // and margin of safety is orders of magnitude.
+        // We could hack Sitmo to skip in the higher order words of state if necessary
+        /////////////////////////////////////////////////////////////////////////////////////
+        uint64_t skip = 0x1; skip = skip<<40;
+        eng.discard(skip);
+      }
+#endif
+
+      static RngEngine Reseed(RngEngine &eng)
+      {
+        std::vector<uint32_t> newseed;
+        std::uniform_int_distribution<uint32_t> uid;
+        return Reseed(eng,newseed,uid);
+      }
+      static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
+                              std::uniform_int_distribution<uint32_t> &uid)
+      {
+        const int reseeds=4;
+
+        newseed.resize(reseeds);
+        for(int i=0;i<reseeds;i++){
+          newseed[i] = uid(eng);
+        }
+        std::seed_seq sseq(newseed.begin(),newseed.end());
+        return RngEngine(sseq);
+      }
       void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
         saved.resize(RngStateCount);
         std::stringstream ss;
@@ -160,7 +188,6 @@ namespace Grid {
     void GetEngine(RngEngine &Eng, int gen){
       Eng=_generators[gen];
     }
-
     template<class source> void Seed(source &src, int gen)
     {
       _generators[gen] = RngEngine(src);
@@ -190,10 +217,11 @@ namespace Grid {
       for(int idx=0;idx<words;idx++){
         fillScalar(buf[idx],dist[0],_generators[0]);
       }
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     };
+
     template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
       dist[0].reset();
       fillScalar(l,dist[0],_generators[0]);
@@ -253,14 +281,13 @@ namespace Grid {
       std::seed_seq src(seeds.begin(),seeds.end());
       Seed(src,0);
     }
-
   };
 
   class GridParallelRNG : public GridRNGbase {
   public:
-
     GridBase *_grid;
     int _vol;
+
   public:
 
     int generator_idx(int os,int is){
       return is*_grid->oSites()+os;
@@ -289,7 +316,6 @@ namespace Grid {
       int osites=_grid->oSites();
       int words=sizeof(scalar_object)/sizeof(scalar_type);
 
-
       parallel_for(int ss=0;ss<osites;ss++){
         std::vector<scalar_object> buf(Nsimd);
@@ -314,23 +340,34 @@ namespace Grid {
 
     void SeedFixedIntegers(const std::vector<int> &seeds){
 
+      // Everyone generates the same seed_seq based on input seeds
       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
 
       std::seed_seq source(seeds.begin(),seeds.end());
       RngEngine master_engine(source);
 
-#ifdef RNG_SITMO
+#ifdef RNG_FAST_DISCARD
+      ////////////////////////////////////////////////
+      // Skip ahead through a single stream.
+      // Applicable to SITMO and other hash-based/crypto RNGs.
+      // Should be applicable to Mersenne Twister, but the C++11
+      // MT implementation does not implement fast discard even though
+      // in principle this is possible.
+      ////////////////////////////////////////////////
       std::vector<int> gcoor;
+      int rank,o_idx,i_idx;
 
+      // Everybody loops over global volume.
       for(int gidx=0;gidx<_grid->_gsites;gidx++){
 
         Skip(master_engine); // Skip to next RNG sequence
 
-        int rank,o_idx,i_idx;
+        // Where is it?
         _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
         _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
+        // If this is one of mine we take it
         if( rank == _grid->ThisRank() ){
           int l_idx=generator_idx(o_idx,i_idx);
           _generators[l_idx] = master_engine;
         }
@@ -338,23 +375,24 @@ namespace Grid {
       }
 
 #else
-      // Machine and thread decomposition dependent seeding
-      // is efficient and maximally parallel; but not
-      // reproducible from machine to machine. Not ideal, but fast.
-      // Different seed for each node.
+      ////////////////////////////////////////////////////////////////
+      // Machine and thread decomposition dependent seeding is efficient
+      // and maximally parallel; but NOT reproducible from machine to machine.
+      // Not ideal, but fastest way to reseed all nodes.
+      ////////////////////////////////////////////////////////////////
       {
+        // Obtain one Reseed per processor
         int Nproc = _grid->ProcessorCount();
-        int me= _grid->ThisRank();
         std::vector<RngEngine> seeders(Nproc);
-
+        int me= _grid->ThisRank();
         for(int p=0;p<Nproc;p++){
           seeders[p] = Reseed(master_engine);
         }
         master_engine = seeders[me];
       }
 
       {
+        // Obtain one reseeded generator per thread
         int Nthread = GridThread::GetThreads();
         std::vector<RngEngine> seeders(Nthread);
         for(int t=0;t<Nthread;t++){
           seeders[t] = Reseed(master_engine);
         }
 
         parallel_for(int t=0;t<Nthread;t++) {
+          // set up one per local site in threaded fashion
+          std::vector<uint32_t> newseeds;
+          std::uniform_int_distribution<uint32_t> uid;
           for(int l=0;l<_grid->lSites();l++) {
             if ( (l%Nthread)==t ) {
-              _generators[l] = Reseed(master_engine);
+              _generators[l] = Reseed(seeders[t],newseeds,uid);
             }
           }
         }
       }
 #endif
     }
-
+    ////////////////////////////////////////////////////////////////////////
+    // Support for rigorous test of RNG's
+    // Return uniform random uint32_t from requested site generator
+    ////////////////////////////////////////////////////////////////////////
     uint32_t GlobalU01(int gsite){
-      std::vector<int> gcoor;
-      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
 
+      uint32_t the_number;
+      // who
+      std::vector<int> gcoor;
       int rank,o_idx,i_idx;
+      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
       _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
+      // draw
       int l_idx=generator_idx(o_idx,i_idx);
-
-      uint32_t the_number;
       if( rank == _grid->ThisRank() ){
         the_number = _uid[l_idx](_generators[l_idx]);
       }
-
+
+      // share & return
       _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
-
       return the_number;
     }
-
   };
 
-  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._uniform);
-  }
+  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
+  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
+  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 
-  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._gaussian);
-  }
-
-  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._bernoulli);
-  }
-
-  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._uniform);
-  }
-
-  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._gaussian);
-  }
-
-  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._bernoulli);
-  }
+  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform ); }
+  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
+  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 }
 #endif
diff --git a/tests/testu01/Test_smallcrush.cc b/tests/testu01/Test_smallcrush.cc
index 28974855..51f0d60c 100644
--- a/tests/testu01/Test_smallcrush.cc
+++ b/tests/testu01/Test_smallcrush.cc
@@ -32,17 +32,17 @@ using namespace Grid;
 using namespace Grid::QCD;
 
 // Wrap Grid's parallel RNG for testU01
+#define BIG_CRUSH                  // Big crush enable (long running)
+#undef  TEST_RNG_STANDALONE        // Test serial RNGs in isolation
 extern "C" {
 #include "TestU01.h"
-#include "gdef.h"
-#include "unif01.h"
-#include "ucarry.h"
-#include "bbattery.h"
 }
 
 std::vector<std::ranlux48>      EngineRanlux;
 std::vector<std::mt19937>       EngineMT;
+
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
 std::vector<sitmo::prng_engine> EngineSitmo;
 
 std::uniform_int_distribution<uint32_t> uid;
@@ -74,21 +74,13 @@ public:
     sRNG = _sRNG;
     pRNG = _pRNG;
     _grid= grid;
     gsites= grid->_gsites;
-    std::cout << "Init: Global sites is " << gsites << std::endl;
   }
   uint32_t GetU01(void){
     uint32_t ret_val;
-#if 0
-    ret_val = sRNG->_uid[0](sRNG->_generators[0]);
-#else
     ret_val = pRNG->GlobalU01(site);
     site=(site+1)%gsites;
-#endif
-    // std::cout << "site "<<
-- 
2.14.1
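
Editor's sketch (not part of the patch): the RNG_FAST_DISCARD path in SeedFixedIntegers() above hands every global lattice site its own subsequence of a single master stream by discarding 2^40 draws between sites. The toy program below illustrates that scheme using only standard C++11. The names gsites, nranks, myrank and the small stride are invented for the example, and std::mt19937_64 is used only because it is universally available; its discard() walks the state step by step, which is exactly why the patch reserves the fast path for the Sitmo engine, whose skip is cheap.

// Toy illustration of skip-ahead parallel seeding (not Grid's implementation).
// The stride is kept small so the program actually runs; the patch uses 1<<40
// together with an engine whose discard() is effectively O(1).
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
  const uint64_t stride = uint64_t(1) << 20;  // patch value is 1<<40 (Sitmo only)
  const int gsites = 16;                      // toy "global volume"
  const int nranks = 4;                       // toy processor decomposition
  const int myrank = 1;                       // pretend we are this rank

  std::seed_seq source{1, 2, 3, 4};           // every rank starts from the same seeds
  std::mt19937_64 master(source);             // one master engine, one logical stream

  std::vector<std::mt19937_64> generators;    // per-site engines owned by this rank
  for (int gidx = 0; gidx < gsites; ++gidx) {
    master.discard(stride);                   // jump to the next site's subsequence
    if (gidx % nranks == myrank) {            // toy "is this site mine?" test
      generators.push_back(master);           // keep a copy of the advanced state
    }
  }
  std::cout << "rank " << myrank << " owns " << generators.size()
            << " site generators" << std::endl;
  return 0;
}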
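
Editor's sketch (not part of the patch): the Test_smallcrush.cc hunk is cut off before the wrapper objects are handed to TestU01, so the snippet below only shows the usual way a function returning 32 random bits per call is driven through SmallCrush (or BigCrush when BIG_CRUSH is defined), assuming the standard TestU01 C API (unif01_CreateExternGenBits, bbattery_SmallCrush). The xorshift stand-in generator is invented for the example and is not Grid's RNG.

// Minimal TestU01 hook-up sketch (illustrative only, not the elided file tail).
#include <cstdint>
extern "C" {
#include "TestU01.h"   // same umbrella header the test file itself uses
}

// Stand-in for a call such as pRNG->GlobalU01(site): any function that yields
// 32 uniform bits per call can be wrapped the same way.
static unsigned int ToyBits(void) {
  static uint32_t x = 2463534242u;            // xorshift32 state, illustration only
  x ^= x << 13; x ^= x >> 17; x ^= x << 5;
  return x;
}

int main(void) {
  unif01_Gen *gen = unif01_CreateExternGenBits((char *)"toy wrapper", ToyBits);
  bbattery_SmallCrush(gen);                   // bbattery_BigCrush(gen) when BIG_CRUSH is set
  unif01_DeleteExternGenBits(gen);
  return 0;
}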