From 1c4bc7ed38c6e008972e205920d11e31880313d0 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 31 Mar 2017 14:41:48 +0900
Subject: [PATCH 01/13] Debugged staggered conventions

---
 .../fermion/ImprovedStaggeredFermion.cc       |  61 ++-
 .../action/fermion/ImprovedStaggeredFermion.h |   9 +
 lib/qcd/action/fermion/StaggeredKernelsAsm.cc |  13 +-
 .../action/fermion/StaggeredKernelsHand.cc    |  21 +-
 scripts/zmobius.sh                            |  35 ++
 tests/qdpxx/Test_qdpxx_stag.cc                | 364 ++++++++++++++++++
 6 files changed, 491 insertions(+), 12 deletions(-)
 create mode 100644 scripts/zmobius.sh
 create mode 100644 tests/qdpxx/Test_qdpxx_stag.cc
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
index ec5811e0..2ba4f4af 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -40,10 +40,10 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
 // Constructor and gauge import
 /////////////////////////////////
 
+
 template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
-							 GridRedBlackCartesian &Hgrid, RealD _mass,
-							 RealD _c1, RealD _c2,RealD _u0,
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
+							 RealD _mass,
 							 const ImplParams &p)
     : Kernels(p),
       _grid(&Fgrid),
@@ -52,9 +52,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
       StencilEven(&Hgrid, npoint, Even, directions, displacements),  // source is Even
       StencilOdd(&Hgrid, npoint, Odd, directions, displacements),  // source is Odd
       mass(_mass),
-      c1(_c1),
-      c2(_c2),
-      u0(_u0),
       Lebesgue(_grid),
       LebesgueEvenOdd(_cbgrid),
       Umu(&Fgrid),
@@ -65,9 +62,29 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
       UUUmuOdd(&Hgrid) ,
       _tmp(&Hgrid)
 {
-  // Allocate the required comms buffer
+}
+
+template <class Impl> // Constructor taking thin and fat links plus the improvement coefficients c1, c2, u0.
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
+							 GridRedBlackCartesian &Hgrid, RealD _mass,
+							 RealD _c1, RealD _c2,RealD _u0,
+							 const ImplParams &p)
+  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) // delegates grid, stencil and field construction to the gauge-free constructor
+{
+  c1=_c1; // c1/c2/u0 are assigned in the body (after delegation) and before the gauge import below runs
+  c2=_c2;
+  u0=_u0;
   ImportGauge(_Uthin,_Ufat);
 }
+template <class Impl> // Constructor taking pre-built triple and fat links; they are imported without further processing.
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
+							 GridRedBlackCartesian &Hgrid, RealD _mass,
+							 const ImplParams &p)
+  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) // delegates grid, stencil and field construction to the gauge-free constructor
+{
+  ImportGaugeSimple(_Utriple,_Ufat); // NOTE(review): _Uthin is not used, and c1, c2, u0 are not assigned by this constructor
+}
+
 
   ////////////////////////////////////////////////////////////
   // Momentum space propagator should be 
@@ -86,6 +103,34 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
   ImportGauge(_Uthin,_Uthin);
 };
 template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
+{
+  /////////////////////////////////////////////////////////////////
+  // Trivial import: phases, fattening and the like are expected to be already applied to the inputs
+  /////////////////////////////////////////////////////////////////
+  GaugeLinkField U(GaugeGrid());
+  // U is a scratch link field reused for each of the four stores below, so statement order matters
+  for (int mu = 0; mu < Nd; mu++) {
+    // Triple links: direction mu goes to slot mu unchanged
+    U = PeekIndex<LorentzIndex>(_Utriple, mu);
+    PokeIndex<LorentzIndex>(UUUmu, U, mu );
+    // Slot mu+4 holds minus the adjoint of the same link shifted by -3 along mu
+    U = adj( Cshift(U, mu, -3));
+    PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 );
+    // Fat links: direction mu goes to slot mu unchanged
+    U = PeekIndex<LorentzIndex>(_Ufat, mu);
+    PokeIndex<LorentzIndex>(Umu, U, mu);
+    // Slot mu+4 holds minus the adjoint of the same link shifted by -1 along mu
+    U = adj( Cshift(U, mu, -1));
+    PokeIndex<LorentzIndex>(Umu, -U, mu+4);
+
+  }
+  pickCheckerboard(Even, UmuEven,  Umu); // split both doubled fields into their even and odd checkerboard copies
+  pickCheckerboard(Odd,  UmuOdd ,  Umu);
+  pickCheckerboard(Even, UUUmuEven,UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
+}
+template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) 
 {
   GaugeLinkField U(GaugeGrid());
@@ -115,6 +160,8 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
+  std::cout << " Umu " << Umu._odata[0]<<std::endl;
+  std::cout << " UUUmu " << UUUmu._odata[0]<<std::endl;
   pickCheckerboard(Even, UmuEven, Umu);
   pickCheckerboard(Odd,  UmuOdd , Umu);
   pickCheckerboard(Even, UUUmuEven, UUUmu);
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h
index 45b55760..7d1f2996 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -112,7 +112,16 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
 			   RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
 			   const ImplParams &p = ImplParams());
 
+  ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
+			   GridRedBlackCartesian &Hgrid, RealD _mass,
+			   const ImplParams &p = ImplParams());
+
+  ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
+			   const ImplParams &p = ImplParams());
+
+
   // DoubleStore impl dependent
+  void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
   void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
   void ImportGauge(const GaugeField &_Uthin);
 
diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc
index bfe13f07..fd881716 100644
--- a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -587,7 +587,6 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 					 int sU, const FermionField &in, FermionField &out) 
 {
   assert(0);
-
 };
 
 
@@ -905,9 +904,17 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 #endif
 }
 
+#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
+  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
+				  DoubledGaugeField &U,			\
+				  DoubledGaugeField &UUU,		\
+				  SiteSpinor *buf, int LLs,		\
+				  int sU, const FermionField &in, FermionField &out);
 
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
 
 }}
 
diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc
index f5806657..7de8480c 100644
--- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -299,7 +299,24 @@ void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &l
 
 }
 
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
+  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     SiteSpinor *buf, int LLs, \
+						     int sU, const FermionField &in, FermionField &out, int dag);
+
+#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL)				\
+  template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
+							  SiteSpinor *buf, int sF, \
+							  int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
+
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
 
 }}
diff --git a/scripts/zmobius.sh b/scripts/zmobius.sh
new file mode 100644
index 00000000..04b223d2
--- /dev/null
+++ b/scripts/zmobius.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+fn=$1   # file to scan for "double zmobius_..." coefficient lines; when omitted, stdin is read
+# Prints ls, b-c and one omega.push_back(...) line per s, computed from the b/c coefficient entries.
+grep "double zmobius_" ${fn:+"$fn"} |
+awk 'BEGIN{ m["zmobius_b_coeff"]=0; m["zmobius_c_coeff"]=1; }{ val[m[substr($2,1,15)]][substr($2,17)+0]=$4; }END{
+    # val[0][k] holds b entries and val[1][k] holds c entries; even k is the real part, odd k the imaginary part.
+    ls=length(val[0])/2;
+
+    print "ls = " ls
+
+    bmc=-111;
+
+    for (s=0;s<ls;s++) {
+      br[s] = val[0][2*s + 0];
+      bi[s] = val[0][2*s + 1];
+      cr[s] = val[1][2*s + 0];
+      ci[s] = val[1][2*s + 1];
+      # b-c (real parts) is expected to be the same for every s; -111 marks "not yet set".
+      t=br[s] - cr[s];
+      if (bmc == -111)
+        bmc=t;
+      else if (bmc != t)
+        print "Warning: b-c is not constant!";
+      # Real and imaginary parts of omega computed from (br, bi); both share the same denominator.
+      omegar[s] = (-1.0 + 2.0* br[s])/(4.0*bi[s]**2.0 + (1.0 - 2.0* br[s])**2);
+      omegai[s] = - 2.0* bi[s]/(4.0*bi[s]**2.0 + (1.0 - 2.0* br[s])**2);
+    }
+
+    print "b-c = " bmc
+
+    for (s=0;s<ls;s++) {
+      printf( "omega.push_back( std::complex<double>(%.15g,%.15g) );\n",omegar[s],omegai[s]);
+    }
+
+}'
diff --git a/tests/qdpxx/Test_qdpxx_stag.cc b/tests/qdpxx/Test_qdpxx_stag.cc
new file mode 100644
index 00000000..a7563924
--- /dev/null
+++ b/tests/qdpxx/Test_qdpxx_stag.cc
@@ -0,0 +1,364 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/qdpxx/Test_qdpxx_stag.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+double mq=0.1;
+
+typedef Grid::QCD::StaggeredImplR::FermionField FermionField;
+typedef Grid::QCD::LatticeGaugeField GaugeField;
+
+void make_gauge     (GaugeField & lat, FermionField &src);
+void calc_grid      (GaugeField & lat, GaugeField & uthin,GaugeField & ufat, FermionField &src, FermionField &res,int dag);
+void calc_chroma    (GaugeField & lat,GaugeField & uthin,GaugeField & ufat, FermionField &src, FermionField &res,int dag);
+
+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
+namespace Chroma { 
+
+
+class ChromaWrapper { // Static helpers copying fields between Grid and QDP/Chroma, plus construction of the Chroma Asqtad operator.
+public:
+  
+  typedef multi1d<LatticeColorMatrix> U;
+  typedef LatticeStaggeredFermion T4;
+  
+  static void ImportGauge(GaugeField & gr, // Copies the Grid gauge field gr into the QDP links ch, one site at a time.
+			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
+  {
+    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+      cx[0] = x[0];
+      cx[1] = x[1];
+      cx[2] = x[2];
+      cx[3] = x[3];
+      Grid::peekSite(LCM,gr,x);
+
+      for(int mu=0;mu<4;mu++){
+	for(int i=0;i<3;i++){
+	for(int j=0;j<3;j++){
+	  cc = LCM(mu)()(i,j);
+	  c = QDP::cmplx(QDP::Real(real(cc)),QDP::Real(imag(cc)));
+	  QDP::pokeColor(cm,c,i,j);
+	}}
+	QDP::pokeSite(ch[mu],cm,cx);
+      }
+
+    }}}}
+  }
+
+  static void ExportGauge(GaugeField & gr, // Copies the QDP links ch into the Grid gauge field gr, one site at a time.
+			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
+  {
+    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+      cx[0] = x[0];
+      cx[1] = x[1];
+      cx[2] = x[2];
+      cx[3] = x[3];
+
+      for(int mu=0;mu<4;mu++){
+	for(int i=0;i<3;i++){
+	for(int j=0;j<3;j++){
+	  cm = QDP::peekSite(ch[mu],cx); // the site matrix is re-read for every (i,j) pair
+	  c  = QDP::peekColor(cm,i,j);
+	  cc = Grid::Complex(toDouble(real(c)),toDouble(imag(c)));
+	  LCM(mu)()(i,j)= cc;
+	}}
+      }
+      Grid::pokeSite(LCM,gr,x);
+
+    }}}}
+  }
+
+  
+  static void ImportFermion(FermionField & gr, // Copies the Grid fermion gr into the QDP staggered fermion ch (spin slot 0).
+			    QDP::LatticeStaggeredFermion & ch  ) 
+  {
+    Grid::QCD::ColourVector F;
+    Grid::Complex c;
+
+
+    std::vector<int> x(5); // NOTE(review): five entries allocated but only x[0..3] are set below — confirm 4 is what peekSite expects
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+      cx[0] = x[0];
+      cx[1] = x[1];
+      cx[2] = x[2];
+      cx[3] = x[3];
+
+      Grid::peekSite(F,gr,x);
+      QDP::ColorVector cv;
+      for(int j=0;j<3;j++){
+	QDP::Complex cc;
+	c  = F()()(j) ;
+	cc = QDP::cmplx(QDP::Real(real(c)),QDP::Real(imag(c)));
+	pokeColor(cv,cc,j);
+      }
+      QDP::StaggeredFermion cF;
+      pokeSpin(cF,cv,0);
+      QDP::pokeSite(ch,cF,cx);
+    }}}}
+  }
+  static void ExportFermion(FermionField & gr, // Copies the QDP staggered fermion ch (spin slot 0) into the Grid fermion gr.
+			    QDP::LatticeStaggeredFermion & ch  ) 
+  {
+    Grid::QCD::ColourVector F;
+    Grid::Complex c;
+
+    std::vector<int> x(5); // NOTE(review): five entries allocated but only x[0..3] are set below — confirm 4 is what pokeSite expects
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+      cx[0] = x[0];
+      cx[1] = x[1];
+      cx[2] = x[2];
+      cx[3] = x[3];
+
+      QDP::StaggeredFermion cF = QDP::peekSite(ch,cx);
+      for(int j=0;j<3;j++){
+	QDP::ColorVector cS=QDP::peekSpin(cF,0);
+	QDP::Complex cc=QDP::peekColor(cS,j);
+	c = Grid::Complex(QDP::toDouble(QDP::real(cc)), 
+			  QDP::toDouble(QDP::imag(cc)));
+	F()()(j) = c;
+      }
+      Grid::pokeSite(F,gr,x);
+    }}}}
+  }
+
+  static Handle< Chroma::EvenOddLinearOperator<T4,U,U> >  GetLinOp (U &u,U &u_fat,U &u_triple)
+  { // Builds the Chroma Asqtad action on links u, writes its fat and triple links to u_fat / u_triple, returns its operator.
+    QDP::Real _mq(mq); // mass taken from the file-scope variable mq
+    QDP::multi1d<int> bcs(QDP::Nd);
+
+    bcs[0] = bcs[1] = bcs[2] = bcs[3] = 1; // boundary factor 1 in all four directions (presumably periodic — confirm)
+
+    Chroma::AsqtadFermActParams p; 
+    p.Mass = _mq; 
+    p.u0 = Real(1.0);
+
+
+    Chroma::Handle<Chroma::FermBC<T4,U,U> > fbc(new Chroma::SimpleFermBC< T4, U, U >(bcs));
+    Chroma::Handle<Chroma::CreateFermState<T4,U,U> > cfs( new Chroma::CreateSimpleFermState<T4,U,U>(fbc));
+    Chroma::AsqtadFermAct S_f(cfs,p);
+    Chroma::Handle< Chroma::FermState<T4,U,U> >  ffs(  S_f.createState(u) );
+    u_fat   =ffs.cast<AsqtadConnectStateBase>()->getFatLinks();
+    u_triple=ffs.cast<AsqtadConnectStateBase>()->getTripleLinks();
+    return S_f.linOp(ffs);
+  }
+
+};
+}
+
+int main (int argc,char **argv )
+{ // Applies the Chroma Asqtad operator and Grid's improved staggered operator to one source, for dag = 0 and 1, and prints norms.
+
+  /********************************************************
+   * Setup QDP
+   *********************************************************/
+  Chroma::initialize(&argc,&argv);
+  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+
+  /********************************************************
+   * Setup Grid
+   *********************************************************/
+  Grid::Grid_init(&argc,&argv);
+  Grid::GridCartesian * UGrid   = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
+									    Grid::GridDefaultSimd(Grid::QCD::Nd,Grid::vComplex::Nsimd()),
+									    Grid::GridDefaultMpi());
+  
+  std::vector<int> gd = UGrid->GlobalDimensions();
+  QDP::multi1d<int> nrow(QDP::Nd);
+  for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu]; // QDP lattice extents are copied from the Grid global dimensions
+
+  QDP::Layout::setLattSize(nrow);
+  QDP::Layout::create();
+
+  GaugeField uthin  (UGrid);
+  GaugeField ufat   (UGrid);
+  GaugeField utriple(UGrid);
+  FermionField    src(UGrid);
+  FermionField    res_chroma(UGrid);
+  FermionField    res_grid  (UGrid);
+  
+
+  {
+
+    std::cout << "*****************************"<<std::endl;
+    std::cout << "Staggered Action "            <<std::endl;
+    std::cout << "*****************************"<<std::endl;
+
+    make_gauge(uthin,src); // fixed-seed hot gauge field and Gaussian source
+
+    for(int dag=0;dag<2;dag++) {
+
+      std::cout << "Dag =  "<<dag<<std::endl;
+      
+      calc_chroma(uthin,utriple,ufat,src,res_chroma,dag); // also fills utriple and ufat with the links exported from Chroma
+
+      // Remove the normalisation of Chroma Gauge links ??
+      std::cout << "Norm of chroma Asqtad multiply "<<Grid::norm2(res_chroma)<<std::endl;
+      calc_grid  (uthin,utriple,ufat,src,res_grid,dag); // Grid operator built from the links exported above
+
+      std::cout << "Norm of thin gauge "<< Grid::norm2(uthin) <<std::endl;
+      std::cout << "Norm of fat  gauge "<< Grid::norm2(ufat) <<std::endl;
+
+      std::cout << "Norm of Grid Asqtad multiply "<<Grid::norm2(res_grid)<<std::endl;
+      
+      /*
+      std::cout << " site 0 of Uthin  "<<uthin._odata[0] <<std::endl;
+      std::cout << " site 0 of Utriple"<<utriple._odata[0] <<std::endl;
+      std::cout << " site 0 of Ufat   "<<ufat._odata[0] <<std::endl;
+
+      std::cout << " site 0 of Grid   "<<res_grid._odata[0] <<std::endl;
+      std::cout << " site 0 of Chroma "<<res_chroma._odata[0] <<std::endl;
+      */
+
+      res_chroma=res_chroma - res_grid; // res_chroma now holds the difference; it is only printed, not asserted on
+      std::cout << "Norm of difference "<<Grid::norm2(res_chroma)<<std::endl;
+    }
+  }
+
+  std::cout << "Finished test "<<std::endl;
+
+  Chroma::finalize(); // NOTE(review): only Chroma is finalised here; confirm whether Grid needs a matching finalise call
+}
+
+void calc_chroma(GaugeField & lat, GaugeField &uthin, GaugeField &ufat, FermionField &src, FermionField &res,int dag)
+{ // Applies the Chroma Asqtad operator (dagger when dag != 0) to src, writes the result to res, and exports Chroma's links.
+  typedef QDP::LatticeStaggeredFermion T;
+  typedef QDP::multi1d<QDP::LatticeColorMatrix> U;
+  
+  U u(4);
+  U ut(4);
+  U uf(4);
+
+  //  Chroma::HotSt(u);
+  Chroma::ChromaWrapper::ImportGauge(lat,u) ; // lat is the input gauge field (main passes its thin links here)
+
+  QDP::LatticeStaggeredFermion  check;
+  QDP::LatticeStaggeredFermion  result; // not used below
+  QDP::LatticeStaggeredFermion  tmp;
+  QDP::LatticeStaggeredFermion  psi;
+
+  Chroma::ChromaWrapper::ImportFermion(src,psi);
+
+  auto linop =Chroma::ChromaWrapper::GetLinOp(u,uf,ut); // fills uf (fat) and ut (triple) from the Chroma fermion state
+
+  Chroma::ChromaWrapper::ExportGauge(uthin,ut) ; // despite its name, the uthin parameter receives the triple links (main passes utriple)
+  Chroma::ChromaWrapper::ExportGauge(ufat ,uf) ;
+
+  enum Chroma::PlusMinus isign;
+  if ( dag ) {
+    isign=Chroma::MINUS;
+  } else {
+    isign=Chroma::PLUS;
+  }
+
+  std::cout << "Calling Chroma Linop "<< std::endl;
+  linop->evenEvenLinOp(tmp,psi,isign); check[rb[0]] = tmp; // assemble the full result from the four checkerboard pieces
+  linop->oddOddLinOp  (tmp,psi,isign); check[rb[1]] = tmp;
+  linop->evenOddLinOp(tmp,psi,isign) ; check[rb[0]]+= tmp;
+  linop->oddEvenLinOp(tmp,psi,isign) ; check[rb[1]]+= tmp;
+
+  Chroma::ChromaWrapper::ExportFermion(res,check) ;
+}
+
+
+void make_gauge(GaugeField & Umu,FermionField &src)
+{ // Fills Umu with a hot SU(3) configuration and src with Gaussian noise, both drawn from one fixed-seed parallel RNG.
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  std::vector<int> seeds4({1,2,3,4}); // fixed seeds, so every run produces the same fields
+
+  Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu._grid; // the RNG is built on the grid Umu lives on
+  Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  Grid::QCD::SU3::HotConfiguration(RNG4,Umu);
+  Grid::gaussian(RNG4,src);
+}
+
+void calc_grid(GaugeField & Uthin, GaugeField & Utriple, GaugeField & Ufat, FermionField &src, FermionField &res,int dag)
+{ // Applies Grid's improved staggered operator (Mdag when dag != 0, else M), built from the given links, to src; result in res.
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Uthin._grid;
+  Grid::GridRedBlackCartesian * UrbGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); // NOTE(review): not released in this function — confirm ownership
+
+  Grid::QCD::ImprovedStaggeredFermionR Dstag(Uthin,Utriple,Ufat,*UGrid,*UrbGrid,mq*2.0); // Grid is given twice the mass used on the Chroma side
+
+  std::cout << Grid::GridLogMessage <<" Calling Grid staggered multiply "<<std::endl;
+
+  if ( dag ) 
+    Dstag.Mdag(src,res);  
+  else 
+    Dstag.M(src,res);  
+
+  res = res ; // self-assignment with no effect; NOTE(review): no rescaling to Chroma conventions is applied here — confirm it can go
+  return;
+} 
+
+
+
+
+

From 6af459cae4509c19d95c9d9658140b9a4a1ad488 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 31 Mar 2017 17:07:43 +0900
Subject: [PATCH 02/13] Christoph's coefficients.

---
 tests/core/Test_zmobius_even_odd.cc  | 26 ++++++++++++++++++++------
 tests/solver/Test_zmobius_cg_prec.cc | 26 ++++++++++++++++++++------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/tests/core/Test_zmobius_even_odd.cc b/tests/core/Test_zmobius_even_odd.cc
index d547f2f7..867c3359 100644
--- a/tests/core/Test_zmobius_even_odd.cc
+++ b/tests/core/Test_zmobius_even_odd.cc
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
 
-  const int Ls=8;
+  const int Ls=10;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -87,13 +87,27 @@ int main (int argc, char ** argv)
   RealD mass=0.1;
   RealD M5  =1.8;
   std::vector < std::complex<double>  > omegas;
+#if 0
   for(int i=0;i<Ls;i++){
-        double imag = 0.;
-        if (i==0) imag=1.;
-        if (i==Ls-1) imag=-1.;
-        std::complex<double> temp (0.25+0.01*i, imag*0.1);
-        omegas.push_back(temp);
+    double imag = 0.;
+    if (i==0) imag=1.;
+    if (i==Ls-1) imag=-1.;
+    std::complex<double> temp (0.25+0.01*i, imag*0.01);
+    omegas.push_back(temp);
   }
+#else
+  omegas.push_back( std::complex<double>(1.45806438985048,-0) );
+  omegas.push_back( std::complex<double>(1.18231318389348,-0) );
+  omegas.push_back( std::complex<double>(0.830951166685955,-0) );
+  omegas.push_back( std::complex<double>(0.542352409156791,-0) );
+  omegas.push_back( std::complex<double>(0.341985020453729,-0) );
+  omegas.push_back( std::complex<double>(0.21137902619029,-0) );
+  omegas.push_back( std::complex<double>(0.126074299502912,-0) );
+  omegas.push_back( std::complex<double>(0.0990136651962626,-0) );
+  omegas.push_back( std::complex<double>(0.0686324988446592,0.0550658530827402) );
+  omegas.push_back( std::complex<double>(0.0686324988446592,-0.0550658530827402) );
+#endif
+
   ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,1.,0.);
 //  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
diff --git a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc
index 4ae98d71..922ca8ae 100644
--- a/tests/solver/Test_zmobius_cg_prec.cc
+++ b/tests/solver/Test_zmobius_cg_prec.cc
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::A
 int main(int argc, char** argv) {
   Grid_init(&argc, &argv);
 
-  const int Ls = 16;
+  const int Ls = 10;
 
   GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
       GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
@@ -80,13 +80,27 @@ int main(int argc, char** argv) {
   RealD mass = 0.01;
   RealD M5 = 1.8;
   std::vector < std::complex<double>  > omegas;
+#if 0
   for(int i=0;i<Ls;i++){
-	double imag = 0.;
-	if (i==0) imag=1.;
-	if (i==Ls-1) imag=-1.;
-	std::complex<double> temp (0.25+0.01*i, imag*0.01);
-	omegas.push_back(temp);
+    double imag = 0.;
+    if (i==0) imag=1.;
+    if (i==Ls-1) imag=-1.;
+    std::complex<double> temp (0.25+0.01*i, imag*0.01);
+    omegas.push_back(temp);
   }
+#else
+  omegas.push_back( std::complex<double>(1.45806438985048,-0) );
+  omegas.push_back( std::complex<double>(1.18231318389348,-0) );
+  omegas.push_back( std::complex<double>(0.830951166685955,-0) );
+  omegas.push_back( std::complex<double>(0.542352409156791,-0) );
+  omegas.push_back( std::complex<double>(0.341985020453729,-0) );
+  omegas.push_back( std::complex<double>(0.21137902619029,-0) );
+  omegas.push_back( std::complex<double>(0.126074299502912,-0) );
+  omegas.push_back( std::complex<double>(0.0990136651962626,-0) );
+  omegas.push_back( std::complex<double>(0.0686324988446592,0.0550658530827402) );
+  omegas.push_back( std::complex<double>(0.0686324988446592,-0.0550658530827402) );
+#endif
+
   ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,1.,0.);
 
   LatticeFermion src_o(FrbGrid);

From 7e5faa0f3486c9eba1f8ecc68bd386686a73da5b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:25:44 +0900
Subject: [PATCH 03/13] Multiple RNGs

---
 lib/parallelIO/NerscIO.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index afd35236..72b3e32d 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -491,10 +491,15 @@ static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel
 #ifdef RNG_RANLUX
     header.floating_point = std::string("UINT64");
     header.data_type      = std::string("RANLUX48");
-#else
+#endif
+#ifdef RNG_MT19937
     header.floating_point = std::string("UINT32");
     header.data_type      = std::string("MT19937");
 #endif
+#ifdef RNG_SITMO
+    header.floating_point = std::string("UINT64");
+    header.data_type      = std::string("SITMO");
+#endif
 
   truncate(file);
   offset = writeHeader(header,file);
@@ -522,10 +527,15 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
 #ifdef RNG_RANLUX
   assert(format == std::string("UINT64"));
   assert(data_type == std::string("RANLUX48"));
-#else
+#endif
+#ifdef RNG_MT19937
   assert(format == std::string("UINT32"));
   assert(data_type == std::string("MT19937"));
 #endif
+#ifdef RNG_SITMO
+  assert(format == std::string("UINT64"));
+  assert(data_type == std::string("SITMO"));
+#endif
 
   // depending on datatype, set up munger;
   // munger is a function of <floating point, Real, data_type>

From d1d63a4f2d0bc78c46930b762683c441ef6b226e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:26:05 +0900
Subject: [PATCH 04/13] sitmo default

---
 configure.ac | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 95f573f5..3a323a5d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -321,7 +321,7 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])
 ############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
 	            [Select Random Number Generator to be used])],\
-	            [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
+	            [ac_RNG=${enable_rng}],[ac_RNG=sitmo])
 
 case ${ac_RNG} in
      ranlux48)
@@ -401,6 +401,7 @@ AC_CONFIG_FILES(tests/hadrons/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
+AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(extras/Makefile)
 AC_CONFIG_FILES(extras/Hadrons/Makefile)

From f18f5ed926ac435b4cb14a7cde3b2a4a9fc3887e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:26:26 +0900
Subject: [PATCH 05/13] Drop random device

---
 benchmarks/Benchmark_memory_asynch.cc    | 3 ++-
 benchmarks/Benchmark_memory_bandwidth.cc | 8 ++++----
 benchmarks/Benchmark_staggered.cc        | 2 +-
 benchmarks/Benchmark_su3.cc              | 8 ++++----
 benchmarks/Benchmark_wilson.cc           | 2 +-
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/benchmarks/Benchmark_memory_asynch.cc b/benchmarks/Benchmark_memory_asynch.cc
index 9f4f5d61..2124b137 100644
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@@ -66,7 +66,8 @@ int main (int argc, char ** argv)
 
     Vec tsum; tsum = zero;
 
-    GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+    GridParallelRNG          pRNG(&Grid);      
+    pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
 
     std::vector<double> stop(threads);
     Vector<Vec> sum(threads);
diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 435af7f4..d57c4df5 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -65,7 +65,7 @@ int main (int argc, char ** argv)
 
       uint64_t Nloop=NLOOP;
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -100,7 +100,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -173,7 +173,7 @@ int main (int argc, char ** argv)
       uint64_t Nloop=NLOOP;
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
       LatticeVec y(&Grid); //random(pRNG,y);
diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc
index 121dc0d5..dc2dcf91 100644
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
   pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedRandomDevice();
+  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   typedef typename ImprovedStaggeredFermionR::FermionField FermionField; 
   typename ImprovedStaggeredFermionR::ImplParams params; 
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index b6d1d303..c234c301 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
       LatticeColourMatrix z(&Grid);// random(pRNG,z);
       LatticeColourMatrix x(&Grid);// random(pRNG,x);
@@ -88,7 +88,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);
diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 11f1353f..671e7654 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
   pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedRandomDevice();
+  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 
   LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion result(&Grid); result=zero;

From 9cbcdd65d7c5717a67376d3068970c3657d4c683 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:26:57 +0900
Subject: [PATCH 06/13] No random device seed

---
 lib/algorithms/CoarsenedMatrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index 73f6baff..c2910151 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -425,7 +425,7 @@ namespace Grid {
 	A[p]=zero;
       }
 
-      GridParallelRNG  RNG(Grid()); RNG.SeedRandomDevice();
+      GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
       Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
 
       Complex one(1.0);

From 935d82f5b10f594f40a46e1d8c0dd042d36b3352 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:27:28 +0900
Subject: [PATCH 07/13] sanity checks

---
 lib/cartesian/Cartesian_base.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index cea0f3dc..4ca8c580 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -177,9 +177,11 @@ public:
     // Global addressing
     ////////////////////////////////////////////////////////////////
     void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
+      assert(gidx< gSites());
       Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
     }
     void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
+      assert(lidx<lSites());
       Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
     }
     void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){

From 9dc7ca4c3be4bad81bedb7c54e34b26db58fad38 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:28:22 +0900
Subject: [PATCH 08/13] Sitmo fast init

---
 lib/lattice/Lattice_rng.h | 252 ++++++++++++++++++--------------------
 1 file changed, 122 insertions(+), 130 deletions(-)

diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index 3d653d17..31fef729 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -69,40 +69,6 @@ namespace Grid {
     return multiplicity;
   }
 
-  // Wrap seed_seq to give common interface with random_device
-  // Should rather wrap random_device and have a generate
-  class fixedSeed {
-  public:
-
-    typedef std::seed_seq::result_type result_type;
-
-    std::seed_seq src;
-    
-    template<class int_type> fixedSeed(const std::vector<int_type> &seeds) : src(seeds.begin(),seeds.end()) {};
-
-    template< class RandomIt > void generate( RandomIt begin, RandomIt end ) {
-      src.generate(begin,end);
-    }
-
-  };
-
-
-  class deviceSeed {
-  public:
-
-    std::random_device rd;
-
-    typedef std::random_device::result_type result_type;
-    
-    deviceSeed(void) : rd(){};
-
-    template< class RandomIt > void generate( RandomIt begin, RandomIt end ) {
-      for(RandomIt it=begin; it!=end;it++){
-	*it = rd();
-      }
-    }
-  };
-
   // real scalars are one component
   template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
   {
@@ -118,67 +84,100 @@ namespace Grid {
   }
   
   class GridRNGbase {
-
   public:
-
-    int _seeded;
     // One generator per site.
     // Uniform and Gaussian distributions from these generators.
 #ifdef RNG_RANLUX
-    typedef uint64_t      RngStateType;
     typedef std::ranlux48 RngEngine;
+    typedef uint64_t      RngStateType;
     static const int RngStateCount = 15;
-#elif RNG_MT19937 
+#endif 
+#ifdef RNG_MT19937 
     typedef std::mt19937 RngEngine;
     typedef uint32_t     RngStateType;
     static const int     RngStateCount = std::mt19937::state_size;
-#elif RNG_SITMO
+#endif
+#ifdef RNG_SITMO
     typedef sitmo::prng_engine 	RngEngine;
     typedef uint64_t    	RngStateType;
     static const int    	RngStateCount = 4;
 #endif
-    std::vector<RngEngine>                             _generators;
-    std::vector<std::uniform_real_distribution<RealD>> _uniform;
-    std::vector<std::normal_distribution<RealD>>       _gaussian;
-    std::vector<std::discrete_distribution<int32_t>>   _bernoulli;
+    ///////////////////////
+    // support for parallel init
+    ///////////////////////
+#ifdef RNG_SITMO
+    static void Skip(RngEngine &eng)
+    {
+      uint64_t skip = 0x1; skip = skip<<40;
+      eng.discard(skip);
+    } 
+#endif
+    static RngEngine Reseed(RngEngine &eng)
+    {
+      const int reseeds=4;
+      std::uniform_int_distribution<uint32_t> uid;
+      std::vector<uint32_t> newseed(reseeds);
+      for(int i=0;i<reseeds;i++){
+	newseed[i] = uid(eng);
+      }
+      std::seed_seq sseq(newseed.begin(),newseed.end());
+      return RngEngine(sseq);
+    }    
 
-    void GetState(std::vector<RngStateType> & saved,int gen) {
+    std::vector<RngEngine>                             _generators;
+    std::vector<std::uniform_real_distribution<RealD> > _uniform;
+    std::vector<std::normal_distribution<RealD> >       _gaussian;
+    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+    std::vector<std::uniform_int_distribution<uint32_t> > _uid;
+
+    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
       saved.resize(RngStateCount);
       std::stringstream ss;
-      ss<<_generators[gen];
+      ss<<eng;
       ss.seekg(0,ss.beg);
       for(int i=0;i<RngStateCount;i++){
 	ss>>saved[i];
       }
     }
-    void SetState(std::vector<RngStateType> & saved,int gen){
+    void GetState(std::vector<RngStateType> & saved,int gen) {
+      GetState(saved,_generators[gen]);
+    }
+    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
       assert(saved.size()==RngStateCount);
       std::stringstream ss;
       for(int i=0;i<RngStateCount;i++){
 	ss<< saved[i]<<" ";
       }
       ss.seekg(0,ss.beg);
-      ss>>_generators[gen];
+      ss>>eng;
     }
+    void SetState(std::vector<RngStateType> & saved,int gen){
+      SetState(saved,_generators[gen]);
+    }
+    void SetEngine(RngEngine &Eng, int gen){
+      _generators[gen]=Eng;
+    }
+    void GetEngine(RngEngine &Eng, int gen){
+      Eng=_generators[gen];
+    }
+
+    template<class source> void Seed(source &src, int gen)
+    {
+      _generators[gen] = RngEngine(src);
+    }    
   };
 
   class GridSerialRNG : public GridRNGbase {
   public:
 
-    // FIXME ... do we require lockstep draws of randoms 
-    // from all nodes keeping seeds consistent.
-    // place a barrier/broadcast in the fill routine
-
     GridSerialRNG() : GridRNGbase() {
       _generators.resize(1);
       _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
       _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
       _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
-      _seeded=0;
+      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
     }
 
-
-
     template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
 
       typedef typename sobj::scalar_type scalar_type;
@@ -195,7 +194,6 @@ namespace Grid {
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 
     };
-
     template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
       dist[0].reset();
       fillScalar(l,dist[0],_generators[0]);
@@ -250,19 +248,10 @@ namespace Grid {
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
 
-    template<class source> void Seed(source &src)
-    {
-      _generators[0] = RngEngine(src);
-      _seeded=1;
-    }    
-    void SeedRandomDevice(void){
-      deviceSeed src;
-      Seed(src);
-    }
     void SeedFixedIntegers(const std::vector<int> &seeds){
       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
-      fixedSeed src(seeds);
-      Seed(src);
+      std::seed_seq src(seeds.begin(),seeds.end());
+      Seed(src,0);
     }
 
   };
@@ -285,15 +274,9 @@ namespace Grid {
       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
-      _seeded=0;
+      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
     }
 
-
-
-    //FIXME implement generic IO and create state save/restore
-    //void SaveState(const std::string<char> &file);
-    //void LoadState(const std::string<char> &file);
-
     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
 
       typedef typename vobj::scalar_object scalar_object;
@@ -329,79 +312,88 @@ namespace Grid {
       }
     };
 
-    // This loop could be made faster to avoid the Ahmdahl by
-    // i)  seed generators on each timeslice, for x=y=z=0;
-    // ii) seed generators on each z for x=y=0
-    // iii)seed generators on each y,z for x=0
-    // iv) seed generators on each y,z,x 
-    // made possible by physical indexing.
-    template<class source> void Seed(source &src)
-    {
+    void SeedFixedIntegers(const std::vector<int> &seeds){
 
-      typedef typename source::result_type seed_t;
-      std::uniform_int_distribution<seed_t> uid;
+      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
 
-      int numseed=4;
-      int gsites = _grid->_gsites;
-      std::vector<seed_t> site_init(numseed);
+      std::seed_seq source(seeds.begin(),seeds.end());
+
+      RngEngine master_engine(source);
+
+#ifdef RNG_SITMO
       std::vector<int> gcoor;
 
+      for(int gidx=0;gidx<_grid->_gsites;gidx++){
 
-      // Master RngEngine
-      std::vector<seed_t> master_init(numseed);  src.generate(master_init.begin(),master_init.end());
-      _grid->Broadcast(0,(void *)&master_init[0],sizeof(seed_t)*numseed);
-      fixedSeed master_seed(master_init);
-      RngEngine master_engine(master_seed);
+	Skip(master_engine); // advance the state; does this work?
 
-      // Per node RngEngine
-      std::vector<seed_t> node_init(numseed);
-      for(int r=0;r<_grid->ProcessorCount();r++) {
-
-	std::vector<seed_t> rank_init(numseed);
-	for(int i=0;i<numseed;i++) rank_init[i] = uid(master_engine);
-
-	std::cout << GridLogMessage << "SeedSeq for rank "<<r;
-	for(int i=0;i<numseed;i++) std::cout<<" "<<rank_init[i];
-	std::cout <<std::endl;
-
-	if ( r==_grid->ThisRank() ) { 
-	  for(int i=0;i<numseed;i++) node_init[i] = rank_init[i];
-	}
-
-      }
-
-      ////////////////////////////////////////////////////
-      // Set up a seed_seq wrapper with these 8 words
-      // and draw for each site within node.
-      ////////////////////////////////////////////////////
-      fixedSeed node_seed(node_init);
-      RngEngine node_engine(node_seed);
-
-      for(int gidx=0;gidx<gsites;gidx++){
 	int rank,o_idx,i_idx;
-
 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
 	if( rank == _grid->ThisRank() ){
 	  int l_idx=generator_idx(o_idx,i_idx);
-	  for(int i=0;i<numseed;i++)  site_init[i] = uid(node_engine);
-	  fixedSeed site_seed(site_init);
-	  _generators[l_idx] = RngEngine(site_seed);
+	  _generators[l_idx] = master_engine;
+	}
+
+      }
+#else 
+      // Machine and thread decomposition dependent seeding
+      // is efficient and maximally parallel; but not
+      // reproducible from machine to machine. Not ideal, but fast.
+      // Different seed for each node.
+      {
+	int Nproc = _grid->ProcessorCount();
+	int me= _grid->ThisRank();
+	std::vector<RngEngine> seeders(Nproc);
+	
+	for(int p=0;p<Nproc;p++){
+	  seeders[p] = Reseed(master_engine);
+	}
+	master_engine = seeders[me];
+      }
+
+      // Different seed for each thread
+      {
+	int Nthread = GridThread::GetThreads();
+	std::vector<RngEngine> seeders(Nthread);
+	for(int t=0;t<Nthread;t++){
+	  seeders[t] = Reseed(master_engine);
+	}
+
+	parallel_for(int t=0;t<Nthread;t++) {
+	  // Draw from this thread's own seeder: master_engine is one object shared by every iteration (race).
+	  for(int l=0;l<_grid->lSites();l++) {
+	    if ( (l%Nthread)==t ) {
+	      _generators[l] = Reseed(seeders[t]);
+	    }
+	  }
 	}
       }
-      _seeded=1;
-    }    
-    void SeedRandomDevice(void){
-      deviceSeed src;
-      Seed(src);
+#endif
     }
-    void SeedFixedIntegers(const std::vector<int> &seeds){
-      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
-      fixedSeed src(seeds);
-      Seed(src);
+
+    uint32_t GlobalU01(int gsite){
+
+      std::vector<int> gcoor;
+      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+
+      int rank,o_idx,i_idx;
+      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+      int l_idx=generator_idx(o_idx,i_idx);
+
+      uint32_t the_number;
+      if( rank == _grid->ThisRank() ){
+	the_number = _uid[l_idx](_generators[l_idx]);
+      }
+
+      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
+
+      return the_number;
     }
 
+
   };
 
   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){

From 0fade84ab258c454951b383ea04635c3ab95ab7f Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 00:29:40 +0900
Subject: [PATCH 09/13] No random device

---
 tests/IO/Test_nersc_io.cc                     |   4 +-
 tests/Test_cshift.cc                          |   2 +-
 tests/Test_simd.cc                            |  10 +-
 tests/Test_stencil.cc                         |   2 +-
 tests/core/Test_cshift_red_black.cc           |   2 +-
 tests/core/Test_cshift_red_black_rotate.cc    |   2 +-
 tests/core/Test_cshift_rotate.cc              |   2 +-
 tests/core/Test_gamma.cc                      |   2 +-
 tests/core/Test_gpwilson_even_odd.cc          |   2 +-
 tests/core/Test_lie_generators.cc             |   2 +-
 tests/core/Test_main.cc                       |   4 +-
 tests/core/Test_rng.cc                        |   4 +-
 tests/core/Test_staggered.cc                  |   2 +-
 tests/core/Test_wilson_even_odd.cc            |   2 +-
 tests/core/Test_wilson_tm_even_odd.cc         |   2 +-
 tests/forces/Test_dwf_gpforce.cc              |   4 +-
 tests/forces/Test_gp_rect_force.cc            |   2 +-
 tests/forces/Test_rect_force.cc               |   2 +-
 tests/forces/Test_wilson_force.cc             |   2 +-
 tests/forces/Test_wilson_force_phiMdagMphi.cc |   2 +-
 tests/forces/Test_wilson_force_phiMphi.cc     |   2 +-
 tests/testu01/Makefile.am                     |   3 +
 tests/testu01/Test_smallcrush.cc              | 176 ++++++++++++++++++
 23 files changed, 208 insertions(+), 29 deletions(-)
 create mode 100644 tests/testu01/Makefile.am
 create mode 100644 tests/testu01/Test_smallcrush.cc

diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index 0273d02a..e5ea7cec 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -54,8 +54,8 @@ int main (int argc, char ** argv)
   GridSerialRNG     sRNGa;
   GridSerialRNG     sRNGb;
 
-  pRNGa.SeedRandomDevice();
-  sRNGa.SeedRandomDevice();
+  pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   std::string rfile("./ckpoint_rng.4000");
   NerscIO::writeRNGState(sRNGa,pRNGa,rfile);
diff --git a/tests/Test_cshift.cc b/tests/Test_cshift.cc
index e1dd0db8..f9559a83 100644
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
 
   GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
 
-  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeComplex U(&Fine);
   LatticeComplex ShiftU(&Fine);
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index f19ebd76..b3933ec6 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -125,7 +125,7 @@ template<class scal, class vec,class functor >
 void Tester(const functor &func)
 {
   GridSerialRNG          sRNG;
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   int Nsimd = vec::Nsimd();
 
@@ -184,7 +184,7 @@ void IntTester(const functor &func)
   typedef Integer  scal;
   typedef vInteger vec;
   GridSerialRNG          sRNG;
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   int Nsimd = vec::Nsimd();
 
@@ -242,7 +242,7 @@ template<class reduced,class scal, class vec,class functor >
 void ReductionTester(const functor &func)
 {
   GridSerialRNG          sRNG;
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   int Nsimd = vec::Nsimd();
 
@@ -343,7 +343,7 @@ template<class scal, class vec,class functor >
 void PermTester(const functor &func)
 {
   GridSerialRNG          sRNG;
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   int Nsimd = vec::Nsimd();
 
@@ -409,7 +409,7 @@ template<class scal, class vec,class functor >
 void ExchangeTester(const functor &func)
 {
   GridSerialRNG          sRNG;
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   int Nsimd = vec::Nsimd();
 
diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc
index 38873310..d7bc5a6c 100644
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
   GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
   GridParallelRNG       fRNG(&Fine);
 
-  //  fRNG.SeedRandomDevice();
+  //  fRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
   std::vector<int> seeds({1,2,3,4});
   fRNG.SeedFixedIntegers(seeds);
   
diff --git a/tests/core/Test_cshift_red_black.cc b/tests/core/Test_cshift_red_black.cc
index ae55cece..f9269709 100644
--- a/tests/core/Test_cshift_red_black.cc
+++ b/tests/core/Test_cshift_red_black.cc
@@ -49,7 +49,7 @@ int main (int argc, char ** argv)
   GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
   GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
 
-  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeComplex U(&Fine);
   LatticeComplex ShiftU(&Fine);
diff --git a/tests/core/Test_cshift_red_black_rotate.cc b/tests/core/Test_cshift_red_black_rotate.cc
index 4ff7067b..3ef1cd21 100644
--- a/tests/core/Test_cshift_red_black_rotate.cc
+++ b/tests/core/Test_cshift_red_black_rotate.cc
@@ -49,7 +49,7 @@ int main (int argc, char ** argv)
   GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
   GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
 
-  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeComplex err(&Fine);
   LatticeComplex U(&Fine);
diff --git a/tests/core/Test_cshift_rotate.cc b/tests/core/Test_cshift_rotate.cc
index a42fd22e..64c08892 100644
--- a/tests/core/Test_cshift_rotate.cc
+++ b/tests/core/Test_cshift_rotate.cc
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
 
   GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
 
-  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeComplex U(&Fine);
   LatticeComplex ShiftU(&Fine);
diff --git a/tests/core/Test_gamma.cc b/tests/core/Test_gamma.cc
index ee31a69a..f3805bdb 100644
--- a/tests/core/Test_gamma.cc
+++ b/tests/core/Test_gamma.cc
@@ -245,7 +245,7 @@ int main(int argc, char *argv[])
   GridCartesian Grid(latt_size,simd_layout,mpi_layout);
   GridSerialRNG sRNG;
   
-  sRNG.SeedRandomDevice();
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   std::cout << GridLogMessage << "======== Test algebra" << std::endl;
   createTestAlgebra();
diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc
index b8b320d8..fc12fe75 100644
--- a/tests/core/Test_gpwilson_even_odd.cc
+++ b/tests/core/Test_gpwilson_even_odd.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          pRNG(&Grid);
   //  std::vector<int> seeds({1,2,3,4});
   //  pRNG.SeedFixedIntegers(seeds);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   typedef typename GparityWilsonFermionR::FermionField FermionField;
 
diff --git a/tests/core/Test_lie_generators.cc b/tests/core/Test_lie_generators.cc
index 5623b74b..114b49f7 100644
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@@ -86,7 +86,7 @@ int main(int argc, char** argv) {
 
   // Projectors 
   GridParallelRNG gridRNG(grid);
-  gridRNG.SeedRandomDevice();
+  gridRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   SU3Adjoint::LatticeAdjMatrix Gauss(grid);
   SU3::LatticeAlgebraVector ha(grid);
   SU3::LatticeAlgebraVector hb(grid);
diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc
index 78c28539..1868b0fe 100644
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -89,8 +89,8 @@ int main(int argc, char **argv) {
       GridSerialRNG SerialRNG;
       GridSerialRNG SerialRNG1;
 
-      FineRNG.SeedRandomDevice();
-      SerialRNG.SeedRandomDevice();
+      FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      SerialRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
       std::cout << "SerialRNG" << SerialRNG._generators[0] << std::endl;
 
diff --git a/tests/core/Test_rng.cc b/tests/core/Test_rng.cc
index 1fcb3a13..b5d27c29 100644
--- a/tests/core/Test_rng.cc
+++ b/tests/core/Test_rng.cc
@@ -43,10 +43,10 @@ int main (int argc, char ** argv)
 
   std::vector<int> seeds({1,2,3,4});
 
-  GridSerialRNG             sRNG;   sRNG.SeedRandomDevice();
+  GridSerialRNG             sRNG;   sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   GridSerialRNG            fsRNG;  fsRNG.SeedFixedIntegers(seeds);
 
-  GridParallelRNG           pRNG(&Grid);   pRNG.SeedRandomDevice();
+  GridParallelRNG           pRNG(&Grid);   pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   GridParallelRNG          fpRNG(&Grid);  fpRNG.SeedFixedIntegers(seeds);
 
   SpinMatrix rnd  ; 
diff --git a/tests/core/Test_staggered.cc b/tests/core/Test_staggered.cc
index 89055fc7..75531c83 100644
--- a/tests/core/Test_staggered.cc
+++ b/tests/core/Test_staggered.cc
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
   pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedRandomDevice();
+  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 
   typedef typename ImprovedStaggeredFermionR::FermionField FermionField; 
   typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; 
diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc
index b33bd74d..4933c36e 100644
--- a/tests/core/Test_wilson_even_odd.cc
+++ b/tests/core/Test_wilson_even_odd.cc
@@ -62,7 +62,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          pRNG(&Grid);
   //  std::vector<int> seeds({1,2,3,4});
   //  pRNG.SeedFixedIntegers(seeds);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion phi   (&Grid); random(pRNG,phi);
diff --git a/tests/core/Test_wilson_tm_even_odd.cc b/tests/core/Test_wilson_tm_even_odd.cc
index 36de83ea..a2773244 100644
--- a/tests/core/Test_wilson_tm_even_odd.cc
+++ b/tests/core/Test_wilson_tm_even_odd.cc
@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          pRNG(&Grid);
   //  std::vector<int> seeds({1,2,3,4});
   //  pRNG.SeedFixedIntegers(seeds);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion phi   (&Grid); random(pRNG,phi);
diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc
index 5094b8a7..96a90ff6 100644
--- a/tests/forces/Test_dwf_gpforce.cc
+++ b/tests/forces/Test_dwf_gpforce.cc
@@ -54,8 +54,8 @@ int main (int argc, char ** argv)
 
   std::vector<int> seeds({1,2,3,4});
 
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedRandomDevice();
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedRandomDevice();
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   
   FermionField phi        (FGrid); gaussian(RNG5,phi);
   FermionField Mphi       (FGrid); 
diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc
index 551c3a20..17a7cd2b 100644
--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeGaugeField U(&Grid);
 
diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc
index 97281854..2e2f87b2 100644
--- a/tests/forces/Test_rect_force.cc
+++ b/tests/forces/Test_rect_force.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeGaugeField U(&Grid);
 
diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc
index 60d31b51..52ed00b4 100644
--- a/tests/forces/Test_wilson_force.cc
+++ b/tests/forces/Test_wilson_force.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion phi        (&Grid); gaussian(pRNG,phi);
   LatticeFermion Mphi       (&Grid); 
diff --git a/tests/forces/Test_wilson_force_phiMdagMphi.cc b/tests/forces/Test_wilson_force_phiMdagMphi.cc
index 7717e9bc..2a5814fe 100644
--- a/tests/forces/Test_wilson_force_phiMdagMphi.cc
+++ b/tests/forces/Test_wilson_force_phiMdagMphi.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion phi        (&Grid); gaussian(pRNG,phi);
   LatticeFermion Mphi       (&Grid); 
diff --git a/tests/forces/Test_wilson_force_phiMphi.cc b/tests/forces/Test_wilson_force_phiMphi.cc
index c9e56c32..8cfb1de6 100644
--- a/tests/forces/Test_wilson_force_phiMphi.cc
+++ b/tests/forces/Test_wilson_force_phiMphi.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedRandomDevice();
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion phi        (&Grid); gaussian(pRNG,phi);
   LatticeFermion Mphi       (&Grid); 
diff --git a/tests/testu01/Makefile.am b/tests/testu01/Makefile.am
new file mode 100644
index 00000000..eb4d1eae
--- /dev/null
+++ b/tests/testu01/Makefile.am
@@ -0,0 +1,3 @@
+AM_LDFLAGS += -L$(LIBRARY_PATH) -ltestu01 -lprobdist -lmylib -lm
+AM_CXXFLAGS += -I$(C_INCLUDE_PATH)
+include Make.inc
diff --git a/tests/testu01/Test_smallcrush.cc b/tests/testu01/Test_smallcrush.cc
new file mode 100644
index 00000000..28974855
--- /dev/null
+++ b/tests/testu01/Test_smallcrush.cc
@@ -0,0 +1,176 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/testu01/Test_smallcrush.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Wrap Grid's parallel RNG for testU01
+
+extern "C" { 
+#include "TestU01.h"
+#include "gdef.h"
+#include "unif01.h"
+#include "ucarry.h"
+#include "bbattery.h"
+}
+
+std::vector<std::ranlux48>      EngineRanlux;
+std::vector<std::mt19937>       EngineMT;
+std::vector<sitmo::prng_engine> EngineSitmo;
+
+std::uniform_int_distribution<uint32_t> uid;
+
+uint32_t GetU01Ranlux(void) {
+  return uid(EngineRanlux[0]);
+};
+uint32_t GetU01MT(void) {
+  return uid(EngineMT[0]);
+};
+uint32_t GetU01Sitmo(void) {
+  return uid(EngineSitmo[0]);
+};
+
+typedef Grid::GridRNGbase::RngEngine RngEngine;
+
+struct TestRNG { 
+public:
+  static GridParallelRNG *pRNG;
+  static GridSerialRNG *sRNG;
+  static GridBase *_grid;
+  static RngEngine Eng;
+  static uint64_t site;
+  static uint64_t gsites;
+  static char *name;
+
+  static void Init(GridParallelRNG *_pRNG,GridSerialRNG *_sRNG,GridBase *grid) {
+    pRNG = _pRNG;
+    sRNG = _sRNG;
+    _grid= grid;
+    gsites= grid->_gsites;
+    std::cout << "Init: Global sites is " <<gsites<<std::endl;
+    site = 0;
+  }
+  static uint32_t GetU01(void) { 
+    //    std::uniform_int_distribution<uint32_t> uid;
+    uint32_t ret_val;
+#if 0
+    ret_val = sRNG->_uid[0](sRNG->_generators[0]);
+#else
+    ret_val = pRNG->GlobalU01(site);
+    site=(site+1)%gsites;
+#endif
+    //    std::cout << "site "<<site <<" "<<std::hex << ret_val <<std::dec<<std::endl; 
+    return ret_val;
+
+  }
+};
+
+GridParallelRNG *TestRNG::pRNG;
+GridSerialRNG   *TestRNG::sRNG;
+GridBase        *TestRNG::_grid;
+RngEngine        TestRNG::Eng;
+uint64_t         TestRNG::site;
+uint64_t         TestRNG::gsites;
+
+#ifdef RNG_SITMO
+char * TestRNG::name = (char *)"Grid_Sitmo";
+#endif
+#ifdef RNG_RANLUX
+char * TestRNG::name = (char *)"Grid_ranlux48";
+#endif
+#ifdef RNG_MT19937
+char * TestRNG::name = (char *)"Grid_mt19937";
+#endif
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+     
+  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  std::seed_seq seq(seeds.begin(),seeds.end());
+
+  EngineRanlux.push_back(std::ranlux48(seq));
+  EngineMT.push_back(std::mt19937(seq));
+  EngineSitmo.push_back(sitmo::prng_engine(seq));
+
+  std::cout << GridLogMessage<< "Initialising Grid RNGs "<<std::endl; 
+  GridParallelRNG           pRNG(&Grid);   
+  pRNG.SeedFixedIntegers(std::vector<int>({43,12,7019,9}));
+  GridSerialRNG           sRNG;
+  sRNG.SeedFixedIntegers(std::vector<int>({102,12,99,15}));
+  std::cout << GridLogMessage<< "Initialised Grid RNGs "<<std::endl; 
+
+  TestRNG::Init(&pRNG,&sRNG,&Grid);
+  std::cout << GridLogMessage<< "Grid RNG's are  "<< std::string(TestRNG::name) <<std::endl; 
+
+  unif01_Gen * gen;
+
+  //  gen = ulcg_CreateLCG (2147483647, 397204094, 0, 12345);
+  //  bbattery_SmallCrush (gen);
+
+  //  gen =  ucarry_CreateRanlux (48, 0x12345);
+  //  bbattery_SmallCrush (gen);
+
+  /*
+  std::cout << GridLogMessage<< "Testing Standalone Ranlux" <<std::endl; 
+  gen = unif01_CreateExternGenBits ((char *)"GridRanlux",GetU01Ranlux);
+  bbattery_SmallCrush (gen);
+  unif01_DeleteExternGenBits(gen);
+  std::cout << GridLogMessage<< "Testing Standalone Ranlux is complete" <<std::endl; 
+  */
+  /*
+  std::cout << GridLogMessage<< "Testing Standalone Mersenne Twister" <<std::endl; 
+  gen = unif01_CreateExternGenBits ((char *)"GridMT",GetU01MT);
+  bbattery_SmallCrush (gen);
+  unif01_DeleteExternGenBits(gen);
+  std::cout << GridLogMessage<< "Testing Standalone Mersenne Twister is complete" <<std::endl; 
+
+  std::cout << GridLogMessage<< "Testing Standalone Sitmo" <<std::endl; 
+  gen = unif01_CreateExternGenBits ((char *)"GridSitmo",GetU01Sitmo);
+  bbattery_SmallCrush (gen);
+  unif01_DeleteExternGenBits(gen);
+  std::cout << GridLogMessage<< "Testing Standalone Sitmo is complete" <<std::endl; 
+  */
+
+  //  gen = unif01_CreateExternGenBits ((char *)"xorshift", xorshift);
+  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name) <<std::endl; 
+  gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
+  bbattery_SmallCrush (gen);
+  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
+
+  Grid_finalize();
+}
+

From 83f6fab8fa2b194d77d4fb23badef5fecbea74cd Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 12:10:51 +0900
Subject: [PATCH 10/13] Big/Small crush test, and fast SITMO rng init, faster
 but not ideal MT and Ranlux init.

---
 lib/lattice/Lattice_rng.h        | 175 ++++++++++++++++++-------------
 tests/testu01/Test_smallcrush.cc |  42 +++-----
 2 files changed, 117 insertions(+), 100 deletions(-)

diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index 31fef729..ddd2170f 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -30,12 +30,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define GRID_LATTICE_RNG_H
 
 #include <random>
+
+#ifdef RNG_SITMO
 #include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#endif 
+
+#if defined(RNG_SITMO)
+#define RNG_FAST_DISCARD
+#else 
+#undef  RNG_FAST_DISCARD
+#endif
 
 namespace Grid {
 
-  //http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf ?
-
   //////////////////////////////////////////////////////////////
   // Allow the RNG state to be less dense than the fine grid
   //////////////////////////////////////////////////////////////
@@ -65,20 +72,22 @@ namespace Grid {
 
       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
     }
-
     return multiplicity;
   }
 
   // real scalars are one component
-  template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
+  template<class scalar,class distribution,class generator> 
+  void fillScalar(scalar &s,distribution &dist,generator & gen)
   {
     s=dist(gen);
   }
-  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+  template<class distribution,class generator> 
+  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
   {
     s=ComplexF(dist(gen),dist(gen));
   }
-  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+  template<class distribution,class generator> 
+  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
   {
     s=ComplexD(dist(gen),dist(gen));
   }
@@ -102,27 +111,6 @@ namespace Grid {
     typedef uint64_t    	RngStateType;
     static const int    	RngStateCount = 4;
 #endif
-    ///////////////////////
-    // support for parallel init
-    ///////////////////////
-#ifdef RNG_SITMO
-    static void Skip(RngEngine &eng)
-    {
-      uint64_t skip = 0x1; skip = skip<<40;
-      eng.discard(skip);
-    } 
-#endif
-    static RngEngine Reseed(RngEngine &eng)
-    {
-      const int reseeds=4;
-      std::uniform_int_distribution<uint32_t> uid;
-      std::vector<uint32_t> newseed(reseeds);
-      for(int i=0;i<reseeds;i++){
-	newseed[i] = uid(eng);
-      }
-      std::seed_seq sseq(newseed.begin(),newseed.end());
-      return RngEngine(sseq);
-    }    
 
     std::vector<RngEngine>                             _generators;
     std::vector<std::uniform_real_distribution<RealD> > _uniform;
@@ -130,6 +118,46 @@ namespace Grid {
     std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
     std::vector<std::uniform_int_distribution<uint32_t> > _uid;
 
+    ///////////////////////
+    // support for parallel init
+    ///////////////////////
+#ifdef RNG_FAST_DISCARD
+    static void Skip(RngEngine &eng)
+    {
+      /////////////////////////////////////////////////////////////////////////////////////
+      // Skip by 2^40 elements between successive lattice sites
+      // This goes by 10^12.
+      // Consider quenched updating; likely never exceeding rate of 1000 sweeps
+      // per second on any machine. This gives us of order 10^9 seconds, or roughly 30 years
+      // skip ahead.
+      // For HMC unlikely to go at faster than a solve per second, and 
+      // tens of seconds per trajectory so this is clean in all reasonable cases,
+      // and margin of safety is orders of magnitude.
+      // We could hack Sitmo to skip in the higher order words of state if necessary
+      /////////////////////////////////////////////////////////////////////////////////////
+      uint64_t skip = 0x1; skip = skip<<40;
+      eng.discard(skip);
+    } 
+#endif
+    static RngEngine Reseed(RngEngine &eng)
+    {
+      std::vector<uint32_t> newseed;
+      std::uniform_int_distribution<uint32_t> uid;
+      return Reseed(eng,newseed,uid);
+    }
+    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
+			    std::uniform_int_distribution<uint32_t> &uid)
+    {
+      const int reseeds=4;
+      
+      newseed.resize(reseeds);
+      for(int i=0;i<reseeds;i++){
+	newseed[i] = uid(eng);
+      }
+      std::seed_seq sseq(newseed.begin(),newseed.end());
+      return RngEngine(sseq);
+    }    
+
     void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
       saved.resize(RngStateCount);
       std::stringstream ss;
@@ -160,7 +188,6 @@ namespace Grid {
     void GetEngine(RngEngine &Eng, int gen){
       Eng=_generators[gen];
     }
-
     template<class source> void Seed(source &src, int gen)
     {
       _generators[gen] = RngEngine(src);
@@ -190,10 +217,11 @@ namespace Grid {
       for(int idx=0;idx<words;idx++){
 	fillScalar(buf[idx],dist[0],_generators[0]);
       }
-      
+
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 
     };
+
     template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
       dist[0].reset();
       fillScalar(l,dist[0],_generators[0]);
@@ -253,14 +281,13 @@ namespace Grid {
       std::seed_seq src(seeds.begin(),seeds.end());
       Seed(src,0);
     }
-
   };
 
   class GridParallelRNG : public GridRNGbase {
   public:
-
     GridBase *_grid;
     int _vol;
+  public:
 
     int generator_idx(int os,int is){
       return is*_grid->oSites()+os;
@@ -289,7 +316,6 @@ namespace Grid {
       int     osites=_grid->oSites();
       int words=sizeof(scalar_object)/sizeof(scalar_type);
 
-
       parallel_for(int ss=0;ss<osites;ss++){
 
 	std::vector<scalar_object> buf(Nsimd);
@@ -314,23 +340,34 @@ namespace Grid {
 
     void SeedFixedIntegers(const std::vector<int> &seeds){
 
+      // Everyone generates the same seed_seq based on input seeds
       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
 
       std::seed_seq source(seeds.begin(),seeds.end());
 
       RngEngine master_engine(source);
 
-#ifdef RNG_SITMO
+#ifdef RNG_FAST_DISCARD
+      ////////////////////////////////////////////////
+      // Skip ahead through a single stream.
+      // Applicable to SITMO and other hash based/crypto RNGs
+      // Should be applicable to Mersenne Twister, but the C++11
+      // MT implementation does not implement fast discard even though
+      // in principle this is possible
+      ////////////////////////////////////////////////
       std::vector<int> gcoor;
+      int rank,o_idx,i_idx;
 
+      // Everybody loops over global volume.
       for(int gidx=0;gidx<_grid->_gsites;gidx++){
 
-	Skip(master_engine); // advance the state; does this work?
+	Skip(master_engine); // Skip to next RNG sequence
 
-	int rank,o_idx,i_idx;
+	// Where is it?
 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
+	// If this is one of mine we take it
 	if( rank == _grid->ThisRank() ){
 	  int l_idx=generator_idx(o_idx,i_idx);
 	  _generators[l_idx] = master_engine;
@@ -338,23 +375,24 @@ namespace Grid {
 
       }
 #else 
-      // Machine and thread decomposition dependent seeding
-      // is efficient and maximally parallel; but not
-      // reproducible from machine to machine. Not ideal, but fast.
-      // Different seed for each node.
+      ////////////////////////////////////////////////////////////////
+      // Machine and thread decomposition dependent seeding is efficient
+      // and maximally parallel; but NOT reproducible from machine to machine. 
+      // Not ideal, but fastest way to reseed all nodes.
+      ////////////////////////////////////////////////////////////////
       {
+	// Obtain one Reseed per processor
 	int Nproc = _grid->ProcessorCount();
-	int me= _grid->ThisRank();
 	std::vector<RngEngine> seeders(Nproc);
-	
+	int me= _grid->ThisRank();
 	for(int p=0;p<Nproc;p++){
 	  seeders[p] = Reseed(master_engine);
 	}
 	master_engine = seeders[me];
       }
 
-      // Different seed for each thread
       {
+	// Obtain one reseeded generator per thread
 	int Nthread = GridThread::GetThreads();
 	std::vector<RngEngine> seeders(Nthread);
 	for(int t=0;t<Nthread;t++){
@@ -362,63 +400,52 @@ namespace Grid {
 	}
 
 	parallel_for(int t=0;t<Nthread;t++) {
-	  master_engine = seeders[t];
+	  // set up one per local site in threaded fashion
+	  std::vector<uint32_t> newseeds;
+	  std::uniform_int_distribution<uint32_t> uid;	
 	  for(int l=0;l<_grid->lSites();l++) {
 	    if ( (l%Nthread)==t ) {
-	      _generators[l] = Reseed(master_engine);
+	      _generators[l] = Reseed(seeders[t],newseeds,uid);
 	    }
 	  }
 	}
       }
 #endif
     }
-
+    ////////////////////////////////////////////////////////////////////////
+    // Support for rigorous test of RNG's
+    // Return uniform random uint32_t from requested site generator
+    ////////////////////////////////////////////////////////////////////////
     uint32_t GlobalU01(int gsite){
 
-      std::vector<int> gcoor;
-      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+      uint32_t the_number;
 
+      // who
+      std::vector<int> gcoor;
       int rank,o_idx,i_idx;
+      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
       _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
+      // draw
       int l_idx=generator_idx(o_idx,i_idx);
-
-      uint32_t the_number;
       if( rank == _grid->ThisRank() ){
 	the_number = _uid[l_idx](_generators[l_idx]);
       }
-
+      
+      // share & return
       _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
-
       return the_number;
     }
 
-
   };
 
-  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._uniform);
-  }
+  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
+  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
+  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 
-  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._gaussian);
-  }
-  
-  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
-    rng.fill(l,rng._bernoulli);
-  }
-
-  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._uniform);
-  }
-  
-  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._gaussian);
-  }
-  
-  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
-    rng.fill(l,rng._bernoulli);
-  }
+  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
+  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
+  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 
 }
 #endif
diff --git a/tests/testu01/Test_smallcrush.cc b/tests/testu01/Test_smallcrush.cc
index 28974855..51f0d60c 100644
--- a/tests/testu01/Test_smallcrush.cc
+++ b/tests/testu01/Test_smallcrush.cc
@@ -32,17 +32,17 @@ using namespace Grid;
 using namespace Grid::QCD;
 
 // Wrap Grid's parallel RNG for testU01
+#define BIG_CRUSH             // Big crush enable (long running)
+#undef TEST_RNG_STANDALONE   // Test serial RNGs in isolation
 
 extern "C" { 
 #include "TestU01.h"
-#include "gdef.h"
-#include "unif01.h"
-#include "ucarry.h"
-#include "bbattery.h"
 }
 
 std::vector<std::ranlux48>      EngineRanlux;
 std::vector<std::mt19937>       EngineMT;
+
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
 std::vector<sitmo::prng_engine> EngineSitmo;
 
 std::uniform_int_distribution<uint32_t> uid;
@@ -74,21 +74,13 @@ public:
     sRNG = _sRNG;
     _grid= grid;
     gsites= grid->_gsites;
-    std::cout << "Init: Global sites is " <<gsites<<std::endl;
     site = 0;
   }
   static uint32_t GetU01(void) { 
-    //    std::uniform_int_distribution<uint32_t> uid;
     uint32_t ret_val;
-#if 0
-    ret_val = sRNG->_uid[0](sRNG->_generators[0]);
-#else
     ret_val = pRNG->GlobalU01(site);
     site=(site+1)%gsites;
-#endif
-    //    std::cout << "site "<<site <<" "<<std::hex << ret_val <<std::dec<<std::endl; 
     return ret_val;
-
   }
 };
 
@@ -134,24 +126,17 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "Initialised Grid RNGs "<<std::endl; 
 
   TestRNG::Init(&pRNG,&sRNG,&Grid);
-  std::cout << GridLogMessage<< "Grid RNG's are  "<< std::string(TestRNG::name) <<std::endl; 
+  std::cout << GridLogMessage<< "Grid RNG's are "<< std::string(TestRNG::name) <<std::endl; 
 
   unif01_Gen * gen;
 
-  //  gen = ulcg_CreateLCG (2147483647, 397204094, 0, 12345);
-  //  bbattery_SmallCrush (gen);
-
-  //  gen =  ucarry_CreateRanlux (48, 0x12345);
-  //  bbattery_SmallCrush (gen);
-
-  /*
+#ifdef TEST_RNG_STANDALONE
   std::cout << GridLogMessage<< "Testing Standalone Ranlux" <<std::endl; 
   gen = unif01_CreateExternGenBits ((char *)"GridRanlux",GetU01Ranlux);
   bbattery_SmallCrush (gen);
   unif01_DeleteExternGenBits(gen);
   std::cout << GridLogMessage<< "Testing Standalone Ranlux is complete" <<std::endl; 
-  */
-  /*
+
   std::cout << GridLogMessage<< "Testing Standalone Mersenne Twister" <<std::endl; 
   gen = unif01_CreateExternGenBits ((char *)"GridMT",GetU01MT);
   bbattery_SmallCrush (gen);
@@ -163,14 +148,19 @@ int main (int argc, char ** argv)
   bbattery_SmallCrush (gen);
   unif01_DeleteExternGenBits(gen);
   std::cout << GridLogMessage<< "Testing Standalone Sitmo is complete" <<std::endl; 
-  */
+#endif
 
-  //  gen = unif01_CreateExternGenBits ((char *)"xorshift", xorshift);
-  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name) <<std::endl; 
+#ifdef BIG_CRUSH
+  std::cout << GridLogMessage<< "Testing Grid BigCrush for "<< std::string(TestRNG::name) <<std::endl; 
+  gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
+  bbattery_BigCrush (gen);
+  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
+#else
+  std::cout << GridLogMessage<< "Testing Grid SmallCrush for "<< std::string(TestRNG::name) <<std::endl; 
   gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
   bbattery_SmallCrush (gen);
   std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
-
+#endif
   Grid_finalize();
 }
 

From 561426f6eb466ffb174ee2ff57ab2af5c3655e6b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 2 Apr 2017 23:13:48 +0900
Subject: [PATCH 11/13] Clean up

---
 tests/testu01/Test_smallcrush.cc | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/testu01/Test_smallcrush.cc b/tests/testu01/Test_smallcrush.cc
index 51f0d60c..d09cd577 100644
--- a/tests/testu01/Test_smallcrush.cc
+++ b/tests/testu01/Test_smallcrush.cc
@@ -32,7 +32,9 @@ using namespace Grid;
 using namespace Grid::QCD;
 
 // Wrap Grid's parallel RNG for testU01
-#define BIG_CRUSH             // Big crush enable (long running)
+#undef BIG_CRUSH             // Big crush enable (long running)
+#define MIDDLE_CRUSH             // Crush (middle battery) enable (long running)
+#undef SMALL_CRUSH             // Small crush enable
 #undef TEST_RNG_STANDALONE   // Test serial RNGs in isolation
 
 extern "C" { 
@@ -154,12 +156,19 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "Testing Grid BigCrush for "<< std::string(TestRNG::name) <<std::endl; 
   gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
   bbattery_BigCrush (gen);
-  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
-#else
+  std::cout << GridLogMessage<< "Testing Grid BigCrush "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
+#endif
+#ifdef MIDDLE_CRUSH
+  std::cout << GridLogMessage<< "Testing Grid Crush for "<< std::string(TestRNG::name) <<std::endl; 
+  gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
+  bbattery_Crush (gen);
+  std::cout << GridLogMessage<< "Testing Grid Crush "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
+#endif
+#ifdef SMALL_CRUSH
   std::cout << GridLogMessage<< "Testing Grid SmallCrush for "<< std::string(TestRNG::name) <<std::endl; 
   gen = unif01_CreateExternGenBits(TestRNG::name,TestRNG::GetU01);
   bbattery_SmallCrush (gen);
-  std::cout << GridLogMessage<< "Testing Grid "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
+  std::cout << GridLogMessage<< "Testing Grid SmallCrush "<< std::string(TestRNG::name)<<" is complete" <<std::endl; 
 #endif
   Grid_finalize();
 }

From 35da4ece0b5ee91cdd056c401260f22bededff80 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 Apr 2017 02:18:15 +0900
Subject: [PATCH 12/13] UID fix

---
 lib/communicator/Communicator_mpi3.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 7685768c..6e40142c 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -203,7 +203,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
 
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
+      sprintf(shm_name,"/Grid_mpi3_uid%d_shm_%d_%d",getuid(),GroupRank,r);
 
       shm_unlink(shm_name);
       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0660);
@@ -224,7 +224,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
     for(int r=0;r<ShmSize;r++){
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
     
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
+      sprintf(shm_name,"/Grid_mpi3_uid%d_shm_%d_%d",getuid(),GroupRank,r);
 
       int fd=shm_open(shm_name,O_RDWR,0660);
       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }

From 5592f7b8c1888ab0175d1f8d28fa2e8db0e8e102 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 Apr 2017 02:35:34 +0900
Subject: [PATCH 13/13] Creation mode better implementation

---
 lib/communicator/Communicator_mpi3.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 6e40142c..a8bffc14 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -203,10 +203,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
 
-      sprintf(shm_name,"/Grid_mpi3_uid%d_shm_%d_%d",getuid(),GroupRank,r);
+      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
 
       shm_unlink(shm_name);
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0660);
+      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
       ftruncate(fd, size);
 
@@ -224,9 +224,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
     for(int r=0;r<ShmSize;r++){
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
     
-      sprintf(shm_name,"/Grid_mpi3_uid%d_shm_%d_%d",getuid(),GroupRank,r);
+      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
 
-      int fd=shm_open(shm_name,O_RDWR,0660);
+      int fd=shm_open(shm_name,O_RDWR,0666);
       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
 
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);