Namespace indent

2026-08-02 08:53:29 +01:00 · 2018-01-14 23:55:44 +00:00
parent c19ccdad7c
commit 7e00f643f8
1 changed files with 371 additions and 371 deletions
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,8 +24,8 @@
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_RNG_H
 #define GRID_LATTICE_RNG_H

@@ -41,446 +41,446 @@
 #undef  RNG_FAST_DISCARD
 #endif

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  //////////////////////////////////////////////////////////////
-  // Allow the RNG state to be less dense than the fine grid
-  //////////////////////////////////////////////////////////////
-  inline int RNGfillable(GridBase *coarse,GridBase *fine)
-  {
+//////////////////////////////////////////////////////////////
+// Allow the RNG state to be less dense than the fine grid
+//////////////////////////////////////////////////////////////
+inline int RNGfillable(GridBase *coarse,GridBase *fine)
+{

-    int rngdims = coarse->_ndimension;
+  int rngdims = coarse->_ndimension;

-    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-    int lowerdims   = fine->_ndimension - coarse->_ndimension;
-    assert(lowerdims >= 0);
-    for(int d=0;d<lowerdims;d++){
-      assert(fine->_simd_layout[d]==1);
-      assert(fine->_processors[d]==1);
-    }
-
-    int multiplicity=1;
-    for(int d=0;d<lowerdims;d++){
-      multiplicity=multiplicity*fine->_rdimensions[d];
-    }
-    // local and global volumes subdivide cleanly after SIMDization
-    for(int d=0;d<rngdims;d++){
-      int fd= d+lowerdims;
-      assert(coarse->_processors[d]  == fine->_processors[fd]);
-      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-      assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); 
-
-      multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
-    }
-    return multiplicity;
+  // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+  int lowerdims   = fine->_ndimension - coarse->_ndimension;
+  assert(lowerdims >= 0);
+  for(int d=0;d<lowerdims;d++){
+    assert(fine->_simd_layout[d]==1);
+    assert(fine->_processors[d]==1);
  }

+  int multiplicity=1;
+  for(int d=0;d<lowerdims;d++){
+    multiplicity=multiplicity*fine->_rdimensions[d];
+  }
+  // local and global volumes subdivide cleanly after SIMDization
+  for(int d=0;d<rngdims;d++){
+    int fd= d+lowerdims;
+    assert(coarse->_processors[d]  == fine->_processors[fd]);
+    assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+    assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); 
+
+    multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
+  }
+  return multiplicity;
+}
+
  
 // merge of April 11 2017
-  // this function is necessary for the LS vectorised field
-  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
-  {
-    int rngdims = coarse->_ndimension;
+// this function is necessary for the LS vectorised field
+inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+{
+  int rngdims = coarse->_ndimension;
    
-    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-    int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
-    // assumes that the higher dimensions are not using more processors
-    // all further divisions are local
-    for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
-    for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+  // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+  int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
+  // assumes that the higher dimensions are not using more processors
+  // all further divisions are local
+  for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
+  for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);

-    // then divide the number of local sites
-    // check that the total number of sims agree, meanse the iSites are the same
-    assert(fine->Nsimd() == coarse->Nsimd());
+  // then divide the number of local sites
+  // check that the total number of sims agree, meanse the iSites are the same
+  assert(fine->Nsimd() == coarse->Nsimd());

-    // check that the two grids divide cleanly
-    assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+  // check that the two grids divide cleanly
+  assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );

-    return fine->lSites() / coarse->lSites();
-  }
+  return fine->lSites() / coarse->lSites();
+}
  
-  // real scalars are one component
-  template<class scalar,class distribution,class generator> 
-  void fillScalar(scalar &s,distribution &dist,generator & gen)
-  {
-    s=dist(gen);
-  }
-  template<class distribution,class generator> 
-  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
-  {
-    s=ComplexF(dist(gen),dist(gen));
-  }
-  template<class distribution,class generator> 
-  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
-  {
-    s=ComplexD(dist(gen),dist(gen));
-  }
+// real scalars are one component
+template<class scalar,class distribution,class generator> 
+void fillScalar(scalar &s,distribution &dist,generator & gen)
+{
+  s=dist(gen);
+}
+template<class distribution,class generator> 
+void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+{
+  s=ComplexF(dist(gen),dist(gen));
+}
+template<class distribution,class generator> 
+void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+{
+  s=ComplexD(dist(gen),dist(gen));
+}
  
-  class GridRNGbase {
-  public:
-    // One generator per site.
-    // Uniform and Gaussian distributions from these generators.
+class GridRNGbase {
+public:
+  // One generator per site.
+  // Uniform and Gaussian distributions from these generators.
 #ifdef RNG_RANLUX
-    typedef std::ranlux48 RngEngine;
-    typedef uint64_t      RngStateType;
-    static const int RngStateCount = 15;
+  typedef std::ranlux48 RngEngine;
+  typedef uint64_t      RngStateType;
+  static const int RngStateCount = 15;
 #endif 
 #ifdef RNG_MT19937 
-    typedef std::mt19937 RngEngine;
-    typedef uint32_t     RngStateType;
-    static const int     RngStateCount = std::mt19937::state_size;
+  typedef std::mt19937 RngEngine;
+  typedef uint32_t     RngStateType;
+  static const int     RngStateCount = std::mt19937::state_size;
 #endif
 #ifdef RNG_SITMO
-    typedef sitmo::prng_engine 	RngEngine;
-    typedef uint64_t    	RngStateType;
-    static const int    	RngStateCount = 13;
+  typedef sitmo::prng_engine 	RngEngine;
+  typedef uint64_t    	RngStateType;
+  static const int    	RngStateCount = 13;
 #endif

-    std::vector<RngEngine>                             _generators;
-    std::vector<std::uniform_real_distribution<RealD> > _uniform;
-    std::vector<std::normal_distribution<RealD> >       _gaussian;
-    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
-    std::vector<std::uniform_int_distribution<uint32_t> > _uid;
+  std::vector<RngEngine>                             _generators;
+  std::vector<std::uniform_real_distribution<RealD> > _uniform;
+  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  std::vector<std::uniform_int_distribution<uint32_t> > _uid;

-    ///////////////////////
-    // support for parallel init
-    ///////////////////////
+  ///////////////////////
+  // support for parallel init
+  ///////////////////////
 #ifdef RNG_FAST_DISCARD
-    static void Skip(RngEngine &eng,uint64_t site)
-    {
-      /////////////////////////////////////////////////////////////////////////////////////
-      // Skip by 2^40 elements between successive lattice sites
-      // This goes by 10^12.
-      // Consider quenched updating; likely never exceeding rate of 1000 sweeps
-      // per second on any machine. This gives us of order 10^9 seconds, or 100 years
-      // skip ahead.
-      // For HMC unlikely to go at faster than a solve per second, and 
-      // tens of seconds per trajectory so this is clean in all reasonable cases,
-      // and margin of safety is orders of magnitude.
-      // We could hack Sitmo to skip in the higher order words of state if necessary
-      /////////////////////////////////////////////////////////////////////////////////////
-      //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
-      uint64_t skip = site;
-      skip = skip<<40;
-      eng.discard(skip);
-      //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
-    } 
+  static void Skip(RngEngine &eng,uint64_t site)
+  {
+    /////////////////////////////////////////////////////////////////////////////////////
+    // Skip by 2^40 elements between successive lattice sites
+    // This goes by 10^12.
+    // Consider quenched updating; likely never exceeding rate of 1000 sweeps
+    // per second on any machine. This gives us of order 10^9 seconds, or 100 years
+    // skip ahead.
+    // For HMC unlikely to go at faster than a solve per second, and 
+    // tens of seconds per trajectory so this is clean in all reasonable cases,
+    // and margin of safety is orders of magnitude.
+    // We could hack Sitmo to skip in the higher order words of state if necessary
+    /////////////////////////////////////////////////////////////////////////////////////
+    //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
+    uint64_t skip = site;
+    skip = skip<<40;
+    eng.discard(skip);
+    //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
+  } 
 #endif
-    static RngEngine Reseed(RngEngine &eng)
-    {
-      std::vector<uint32_t> newseed;
-      std::uniform_int_distribution<uint32_t> uid;
-      return Reseed(eng,newseed,uid);
-    }
-    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
-			    std::uniform_int_distribution<uint32_t> &uid)
-    {
-      const int reseeds=4;
+  static RngEngine Reseed(RngEngine &eng)
+  {
+    std::vector<uint32_t> newseed;
+    std::uniform_int_distribution<uint32_t> uid;
+    return Reseed(eng,newseed,uid);
+  }
+  static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
+			  std::uniform_int_distribution<uint32_t> &uid)
+  {
+    const int reseeds=4;
      
-      newseed.resize(reseeds);
-      for(int i=0;i<reseeds;i++){
-	newseed[i] = uid(eng);
-      }
-      std::seed_seq sseq(newseed.begin(),newseed.end());
-      return RngEngine(sseq);
-    }    
+    newseed.resize(reseeds);
+    for(int i=0;i<reseeds;i++){
+      newseed[i] = uid(eng);
+    }
+    std::seed_seq sseq(newseed.begin(),newseed.end());
+    return RngEngine(sseq);
+  }    

-    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
-      saved.resize(RngStateCount);
-      std::stringstream ss;
-      ss<<eng;
-      ss.seekg(0,ss.beg);
-      for(int i=0;i<RngStateCount;i++){
-        ss>>saved[i];
-      }
+  void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
+    saved.resize(RngStateCount);
+    std::stringstream ss;
+    ss<<eng;
+    ss.seekg(0,ss.beg);
+    for(int i=0;i<RngStateCount;i++){
+      ss>>saved[i];
    }
-    void GetState(std::vector<RngStateType> & saved,int gen) {
-      GetState(saved,_generators[gen]);
+  }
+  void GetState(std::vector<RngStateType> & saved,int gen) {
+    GetState(saved,_generators[gen]);
+  }
+  void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
+    assert(saved.size()==RngStateCount);
+    std::stringstream ss;
+    for(int i=0;i<RngStateCount;i++){
+      ss<< saved[i]<<" ";
    }
-    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
-      assert(saved.size()==RngStateCount);
-      std::stringstream ss;
-      for(int i=0;i<RngStateCount;i++){
-        ss<< saved[i]<<" ";
-      }
-      ss.seekg(0,ss.beg);
-      ss>>eng;
-    }
-    void SetState(std::vector<RngStateType> & saved,int gen){
-      SetState(saved,_generators[gen]);
-    }
-    void SetEngine(RngEngine &Eng, int gen){
-      _generators[gen]=Eng;
-    }
-    void GetEngine(RngEngine &Eng, int gen){
-      Eng=_generators[gen];
-    }
-    template<class source> void Seed(source &src, int gen)
-    {
-      _generators[gen] = RngEngine(src);
-    }    
-  };
+    ss.seekg(0,ss.beg);
+    ss>>eng;
+  }
+  void SetState(std::vector<RngStateType> & saved,int gen){
+    SetState(saved,_generators[gen]);
+  }
+  void SetEngine(RngEngine &Eng, int gen){
+    _generators[gen]=Eng;
+  }
+  void GetEngine(RngEngine &Eng, int gen){
+    Eng=_generators[gen];
+  }
+  template<class source> void Seed(source &src, int gen)
+  {
+    _generators[gen] = RngEngine(src);
+  }    
+};

-  class GridSerialRNG : public GridRNGbase {
-  public:
+class GridSerialRNG : public GridRNGbase {
+public:

-    GridSerialRNG() : GridRNGbase() {
-      _generators.resize(1);
-      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
-      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
-      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
-    }
+  GridSerialRNG() : GridRNGbase() {
+    _generators.resize(1);
+    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
+    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
+  }

-    template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
+  template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){

-      typedef typename sobj::scalar_type scalar_type;
+    typedef typename sobj::scalar_type scalar_type;
 
-      int words = sizeof(sobj)/sizeof(scalar_type);
+    int words = sizeof(sobj)/sizeof(scalar_type);

-      scalar_type *buf = (scalar_type *) & l;
+    scalar_type *buf = (scalar_type *) & l;

-      dist[0].reset();
-      for(int idx=0;idx<words;idx++){
-  fillScalar(buf[idx],dist[0],_generators[0]);
-      }
+    dist[0].reset();
+    for(int idx=0;idx<words;idx++){
+      fillScalar(buf[idx],dist[0],_generators[0]);
+    }

-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));

-    };
-
-    template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
-      dist[0].reset();
-      fillScalar(l,dist[0],_generators[0]);
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
-      dist[0].reset();
-      fillScalar(l,dist[0],_generators[0]);
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
-      dist[0].reset();
-      fillScalar(l,dist[0],_generators[0]);
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
-      dist[0].reset();
-      fillScalar(l,dist[0],_generators[0]);
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    // vector fill
-    template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
-      RealF *pointer=(RealF *)&l;
-      dist[0].reset();
-      for(int i=0;i<2*vComplexF::Nsimd();i++){
-  fillScalar(pointer[i],dist[0],_generators[0]);
-      }
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
-      RealD *pointer=(RealD *)&l;
-      dist[0].reset();
-      for(int i=0;i<2*vComplexD::Nsimd();i++){
-  fillScalar(pointer[i],dist[0],_generators[0]);
-      }
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
-      RealF *pointer=(RealF *)&l;
-      dist[0].reset();
-      for(int i=0;i<vRealF::Nsimd();i++){
-  fillScalar(pointer[i],dist[0],_generators[0]);
-      }
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
-      RealD *pointer=(RealD *)&l;
-      dist[0].reset();
-      for(int i=0;i<vRealD::Nsimd();i++){
-	fillScalar(pointer[i],dist[0],_generators[0]);
-      }
-      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
-    }
-    
-    void SeedFixedIntegers(const std::vector<int> &seeds){
-      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
-      std::seed_seq src(seeds.begin(),seeds.end());
-      Seed(src,0);
-    }
  };

-  class GridParallelRNG : public GridRNGbase {
-
-    double _time_counter;
-
-  public:
-    GridBase *_grid;
-    unsigned int _vol;
-
-    int generator_idx(int os,int is) {
-      return is*_grid->oSites()+os;
+  template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  // vector fill
+  template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
+    RealF *pointer=(RealF *)&l;
+    dist[0].reset();
+    for(int i=0;i<2*vComplexF::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
    }
-
-    GridParallelRNG(GridBase *grid) : GridRNGbase() {
-      _grid = grid;
-      _vol  =_grid->iSites()*_grid->oSites();
-
-      _generators.resize(_vol);
-      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
-      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
-      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
+    RealD *pointer=(RealD *)&l;
+    dist[0].reset();
+    for(int i=0;i<2*vComplexD::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
+    RealF *pointer=(RealF *)&l;
+    dist[0].reset();
+    for(int i=0;i<vRealF::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
+    RealD *pointer=(RealD *)&l;
+    dist[0].reset();
+    for(int i=0;i<vRealD::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+    
+  void SeedFixedIntegers(const std::vector<int> &seeds){
+    CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
+    std::seed_seq src(seeds.begin(),seeds.end());
+    Seed(src,0);
+  }
+};

-    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+class GridParallelRNG : public GridRNGbase {

-      typedef typename vobj::scalar_object scalar_object;
-      typedef typename vobj::scalar_type scalar_type;
-      typedef typename vobj::vector_type vector_type;
+  double _time_counter;

-      double inner_time_counter = usecond();
+public:
+  GridBase *_grid;
+  unsigned int _vol;

-      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
-      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
-      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
-      int words  = sizeof(scalar_object) / sizeof(scalar_type);
+  int generator_idx(int os,int is) {
+    return is*_grid->oSites()+os;
+  }

-      parallel_for(int ss=0;ss<osites;ss++){
-        std::vector<scalar_object> buf(Nsimd);
-        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
+  GridParallelRNG(GridBase *grid) : GridRNGbase() {
+    _grid = grid;
+    _vol  =_grid->iSites()*_grid->oSites();

-          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
+    _generators.resize(_vol);
+    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
+    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
+  }

-          for (int si = 0; si < Nsimd; si++) {
+  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+
+    typedef typename vobj::scalar_object scalar_object;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+
+    double inner_time_counter = usecond();
+
+    int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
+    int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
+    int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
+    int words  = sizeof(scalar_object) / sizeof(scalar_type);
+
+    parallel_for(int ss=0;ss<osites;ss++){
+      std::vector<scalar_object> buf(Nsimd);
+      for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
+
+	int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
+
+	for (int si = 0; si < Nsimd; si++) {
            
-            int gdx = generator_idx(ss, si);  // index of generator state
-            scalar_type *pointer = (scalar_type *)&buf[si];
-            dist[gdx].reset();
-            for (int idx = 0; idx < words; idx++) 
-              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-          }
-          // merge into SIMD lanes, FIXME suboptimal implementation
-          merge(l._odata[sm], buf);
-        }
+	  int gdx = generator_idx(ss, si);  // index of generator state
+	  scalar_type *pointer = (scalar_type *)&buf[si];
+	  dist[gdx].reset();
+	  for (int idx = 0; idx < words; idx++) 
+	    fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
+	}
+	// merge into SIMD lanes, FIXME suboptimal implementation
+	merge(l._odata[sm], buf);
      }
+    }

-      _time_counter += usecond()- inner_time_counter;
-    };
+    _time_counter += usecond()- inner_time_counter;
+  };

-    void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds){

-      // Everyone generates the same seed_seq based on input seeds
-      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
+    // Everyone generates the same seed_seq based on input seeds
+    CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());

-      std::seed_seq source(seeds.begin(),seeds.end());
+    std::seed_seq source(seeds.begin(),seeds.end());

-      RngEngine master_engine(source);
+    RngEngine master_engine(source);

 #ifdef RNG_FAST_DISCARD
-      ////////////////////////////////////////////////
-      // Skip ahead through a single stream.
-      // Applicable to SITMO and other has based/crypto RNGs
-      // Should be applicable to Mersenne Twister, but the C++11
-      // MT implementation does not implement fast discard even though
-      // in principle this is possible
-      ////////////////////////////////////////////////
+    ////////////////////////////////////////////////
+    // Skip ahead through a single stream.
+    // Applicable to SITMO and other has based/crypto RNGs
+    // Should be applicable to Mersenne Twister, but the C++11
+    // MT implementation does not implement fast discard even though
+    // in principle this is possible
+    ////////////////////////////////////////////////

-      // Everybody loops over global volume.
-      parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
+    // Everybody loops over global volume.
+    parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){

-	// Where is it?
-	int rank,o_idx,i_idx;
-	std::vector<int> gcoor;
+      // Where is it?
+      int rank,o_idx,i_idx;
+      std::vector<int> gcoor;

-	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-	// If this is one of mine we take it
-	if( rank == _grid->ThisRank() ){
-	  int l_idx=generator_idx(o_idx,i_idx);
-	  _generators[l_idx] = master_engine;
-	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-	}
+      _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
+      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);

+      // If this is one of mine we take it
+      if( rank == _grid->ThisRank() ){
+	int l_idx=generator_idx(o_idx,i_idx);
+	_generators[l_idx] = master_engine;
+	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
      }
+
+    }
 #else 
-      ////////////////////////////////////////////////////////////////
-      // Machine and thread decomposition dependent seeding is efficient
-      // and maximally parallel; but NOT reproducible from machine to machine. 
-      // Not ideal, but fastest way to reseed all nodes.
-      ////////////////////////////////////////////////////////////////
-      {
-	// Obtain one Reseed per processor
-	int Nproc = _grid->ProcessorCount();
-	std::vector<RngEngine> seeders(Nproc);
-	int me= _grid->ThisRank();
-	for(int p=0;p<Nproc;p++){
-	  seeders[p] = Reseed(master_engine);
-	}
-	master_engine = seeders[me];
+    ////////////////////////////////////////////////////////////////
+    // Machine and thread decomposition dependent seeding is efficient
+    // and maximally parallel; but NOT reproducible from machine to machine. 
+    // Not ideal, but fastest way to reseed all nodes.
+    ////////////////////////////////////////////////////////////////
+    {
+      // Obtain one Reseed per processor
+      int Nproc = _grid->ProcessorCount();
+      std::vector<RngEngine> seeders(Nproc);
+      int me= _grid->ThisRank();
+      for(int p=0;p<Nproc;p++){
+	seeders[p] = Reseed(master_engine);
+      }
+      master_engine = seeders[me];
+    }
+
+    {
+      // Obtain one reseeded generator per thread
+      int Nthread = GridThread::GetThreads();
+      std::vector<RngEngine> seeders(Nthread);
+      for(int t=0;t<Nthread;t++){
+	seeders[t] = Reseed(master_engine);
      }

-      {
-	// Obtain one reseeded generator per thread
-	int Nthread = GridThread::GetThreads();
-	std::vector<RngEngine> seeders(Nthread);
-	for(int t=0;t<Nthread;t++){
-	  seeders[t] = Reseed(master_engine);
-	}
-
-	parallel_for(int t=0;t<Nthread;t++) {
-	  // set up one per local site in threaded fashion
-	  std::vector<uint32_t> newseeds;
-	  std::uniform_int_distribution<uint32_t> uid;	
-	  for(int l=0;l<_grid->lSites();l++) {
-	    if ( (l%Nthread)==t ) {
-	      _generators[l] = Reseed(seeders[t],newseeds,uid);
-	    }
+      parallel_for(int t=0;t<Nthread;t++) {
+	// set up one per local site in threaded fashion
+	std::vector<uint32_t> newseeds;
+	std::uniform_int_distribution<uint32_t> uid;	
+	for(int l=0;l<_grid->lSites();l++) {
+	  if ( (l%Nthread)==t ) {
+	    _generators[l] = Reseed(seeders[t],newseeds,uid);
 	  }
 	}
      }
+    }
 #endif
+  }
+
+  void Report(){
+    std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////
+  // Support for rigorous test of RNG's
+  // Return uniform random uint32_t from requested site generator
+  ////////////////////////////////////////////////////////////////////////
+  uint32_t GlobalU01(int gsite){
+
+    uint32_t the_number;
+    // who
+    std::vector<int> gcoor;
+    int rank,o_idx,i_idx;
+    _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+    _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+    // draw
+    int l_idx=generator_idx(o_idx,i_idx);
+    if( rank == _grid->ThisRank() ){
+      the_number = _uid[l_idx](_generators[l_idx]);
    }
-
-    void Report(){
-      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
-    }
-
-
-    ////////////////////////////////////////////////////////////////////////
-    // Support for rigorous test of RNG's
-    // Return uniform random uint32_t from requested site generator
-    ////////////////////////////////////////////////////////////////////////
-    uint32_t GlobalU01(int gsite){
-
-      uint32_t the_number;
-      // who
-      std::vector<int> gcoor;
-      int rank,o_idx,i_idx;
-      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
-      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-      // draw
-      int l_idx=generator_idx(o_idx,i_idx);
-      if( rank == _grid->ThisRank() ){
-	the_number = _uid[l_idx](_generators[l_idx]);
-      }
      
-      // share & return
-      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
-      return the_number;
-    }
+    // share & return
+    _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
+    return the_number;
+  }

-  };
+};

-  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
-  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
+template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
+template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}

-  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
-  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
+template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
+template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }

-}
+NAMESPACE_END(Grid);
 #endif