Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-20 16:56:55 +01:00)

Compare commits: feature/fe ... feature/la (74 commits)
Commits in this comparison (SHA1):

27ea2afe86  78e8704eac  67131d82f2  615a9448b9  00164f5ce5  a7f72eb994  501fa1614a  5bf42e1e15
fe4d9b003c  4a699b4da3  689323f4ee  84b441800f  1ef424b139  aa66f41c69  f96c800d25  32a52d7583
fa04b6d3c2  7fab183c0e  9ec9850bdb  0c4ddaea0b  00ebc150ad  0f3e9ae57d  034de160bf  14507fd6e4
2db05ac214  31f99574fa  a34c8a2961  ccd20df827  e9be293444  d577211cc3  f4336e480a  e4d461cb03
3d63b4894e  08583afaff  b395a312af  66295b99aa  b8654be0ef  a479325349  f6c3f6bf2d  d83868fdbb
303e0b927d  28ba8a0f48  f9e28577f3  8a3aae98f6  8309f2364b  cac1750078  27936900e6  e325929851
47af3565f4  4b4d187935  9aff354ab5  cb9ff20249  9fe6ac71ea  f1fa00b71b  bf58557fb1  10cb37f504
1374c943d4  a1d80282ec  4eb8bbbebe  d1c6288c5f  dd949bc428  bb7378cfc3  f0e084a88c  153672d8ec
08ca338875  f7cbf82c04  07009c569a  09f4cdb11e  1e54882f71  eb6153080a  a6eeea777b  77f7737ccc
f9df685cde  0cd6b1858c
@@ -550,6 +550,7 @@ AC_CONFIG_FILES(tests/forces/Makefile)
 AC_CONFIG_FILES(tests/hadrons/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
+AC_CONFIG_FILES(tests/lanczos/Makefile)
 AC_CONFIG_FILES(tests/smearing/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(tests/testu01/Makefile)
@@ -1,26 +1,25 @@
-#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
-#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
-#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
-#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
-#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Grid/Hadrons/Modules/MSource/Point.hpp>
-#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
-#include <Grid/Hadrons/Modules/MSource/Laplacian.hpp>
-#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
-#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
-#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
-#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
-#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
-#include <Grid/Hadrons/Modules/MSink/Point.hpp>
+#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
+#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
+#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
+#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
+#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
+#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
+#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
+#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
+#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
+#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
+#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
+#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
+#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
+#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
+#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
+#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
+#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
+#include <Grid/Hadrons/Modules/MSink/Point.hpp>
+#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
+#include <Grid/Hadrons/Modules/MSource/Point.hpp>
+#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
+#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
+#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
@@ -1,153 +0,0 @@ (deleted file)
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: extras/Hadrons/Modules/MSource/Laplacian.hpp
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-
-#ifndef Hadrons_MSource_Laplacian_hpp_
-#define Hadrons_MSource_Laplacian_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/*
- Laplacian smearing source
- -----------------------------
- * options:
- - source: name of source object to be smeared (string)
- - N: number of steps (integer)
- - alpha: smearing parameter (real)
-*/
-
-/******************************************************************************
- *                       Laplace smearing operator                           *
- ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MSource)
-
-class LaplacianPar : Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianPar,
-                                    std::string, source,
-                                    std::string, gauge,
-                                    unsigned int, N,
-                                    double, alpha);
-};
-
-template <typename FImpl>
-class TLaplacian : public Module<LaplacianPar>
-{
-public:
-    FERM_TYPE_ALIASES(FImpl, );
-
-public:
-    // constructor
-    TLaplacian(const std::string name);
-    // destructor
-    virtual ~TLaplacian(void) = default;
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-};
-
-MODULE_REGISTER_NS(LaplaceSmearing, TLaplacian<FIMPL>, MSource);
-
-/******************************************************************************
- *                    TLaplacian template implementation                     *
- ******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-template <typename FImpl>
-TLaplacian<FImpl>::TLaplacian(const std::string name)
-: Module<LaplacianPar>(name)
-{
-}
-
-// dependencies/products ///////////////////////////////////////////////////////
-template <typename FImpl>
-std::vector<std::string> TLaplacian<FImpl>::getInput(void)
-{
-    std::vector<std::string> in = {par().source, par().gauge};
-
-    return in;
-}
-
-template <typename FImpl>
-std::vector<std::string> TLaplacian<FImpl>::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void TLaplacian<FImpl>::setup(void)
-{
-    env().template registerLattice<PropagatorField>(getName());
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void TLaplacian<FImpl>::execute(void)
-{
-    FermionField source(env().getGrid()), tmp(env().getGrid());
-    PropagatorField &SmrSrc = *env().template createLattice<PropagatorField>(getName());
-    PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
-    auto &U = *env().template getObject<LatticeGaugeField>(par().gauge);
-    Laplacian<FImpl> LaplaceOperator(env().getGrid());
-    LaplaceOperator.ImportGauge(U);
-    double prefactor = par().alpha / (double)(par().N);
-
-    for (unsigned int s = 0; s < Ns; ++s)
-    {
-        for (unsigned int c = 0; c < Nc; ++c)
-        {
-            PropToFerm(source, fullSrc, s, c);
-            for (int smr = 0; smr < par().N; ++smr)
-            {
-                LaplaceOperator.M(source, tmp);
-                source += prefactor * tmp;
-            }
-            FermToProp(SmrSrc, source, s, c);
-        }
-    }
-}
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons_MSource_Z2_hpp_
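The execute() body of the deleted module applies the gauge-covariant Laplacian N times with weight alpha/N to each spin-colour component of the source. As a sketch of what that loop computes (my reading of the code above, writing Delta for the operator applied by LaplaceOperator.M):

\[
S_{n+1} = \Big(1 + \frac{\alpha}{N}\,\Delta\Big) S_n ,
\qquad
S_N = \Big(1 + \frac{\alpha}{N}\,\Delta\Big)^{N} S_0 \;\approx\; e^{\alpha\Delta} S_0 \ \text{for large } N,
\]

so alpha and N together control how strongly the source is spatially smeared.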
@@ -1,39 +1,38 @@
 modules_cc =\
-  Modules/MContraction/WeakHamiltonianNonEye.cc \
-  Modules/MContraction/WeakNeutral4ptDisc.cc \
-  Modules/MContraction/WeakHamiltonianEye.cc \
-  Modules/MScalar/FreeProp.cc \
-  Modules/MScalar/ChargedProp.cc \
-  Modules/MGauge/Unit.cc \
-  Modules/MGauge/Random.cc \
-  Modules/MGauge/StochEm.cc \
-  Modules/MGauge/Load.cc
+  Modules/MContraction/WeakHamiltonianEye.cc \
+  Modules/MContraction/WeakHamiltonianNonEye.cc \
+  Modules/MContraction/WeakNeutral4ptDisc.cc \
+  Modules/MGauge/Load.cc \
+  Modules/MGauge/Random.cc \
+  Modules/MGauge/StochEm.cc \
+  Modules/MGauge/Unit.cc \
+  Modules/MScalar/ChargedProp.cc \
+  Modules/MScalar/FreeProp.cc
 
 modules_hpp =\
-  Modules/MLoop/NoiseLoop.hpp \
-  Modules/MFermion/GaugeProp.hpp \
-  Modules/MContraction/WeakHamiltonian.hpp \
-  Modules/MContraction/Meson.hpp \
-  Modules/MContraction/DiscLoop.hpp \
-  Modules/MContraction/WeakHamiltonianEye.hpp \
-  Modules/MContraction/Baryon.hpp \
-  Modules/MContraction/WeakHamiltonianNonEye.hpp \
-  Modules/MContraction/WeakNeutral4ptDisc.hpp \
-  Modules/MContraction/Gamma3pt.hpp \
-  Modules/MSource/Z2.hpp \
-  Modules/MSource/SeqGamma.hpp \
-  Modules/MSource/Point.hpp \
-  Modules/MSource/Wall.hpp \
-  Modules/MSource/Laplacian.hpp \
-  Modules/MSolver/RBPrecCG.hpp \
-  Modules/MScalar/ChargedProp.hpp \
-  Modules/MScalar/FreeProp.hpp \
-  Modules/MScalar/Scalar.hpp \
-  Modules/MAction/DWF.hpp \
-  Modules/MAction/Wilson.hpp \
-  Modules/MGauge/StochEm.hpp \
-  Modules/MGauge/Unit.hpp \
-  Modules/MGauge/Random.hpp \
-  Modules/MGauge/Load.hpp \
-  Modules/MSink/Point.hpp
+  Modules/MAction/DWF.hpp \
+  Modules/MAction/Wilson.hpp \
+  Modules/MContraction/Baryon.hpp \
+  Modules/MContraction/DiscLoop.hpp \
+  Modules/MContraction/Gamma3pt.hpp \
+  Modules/MContraction/Meson.hpp \
+  Modules/MContraction/WeakHamiltonian.hpp \
+  Modules/MContraction/WeakHamiltonianEye.hpp \
+  Modules/MContraction/WeakHamiltonianNonEye.hpp \
+  Modules/MContraction/WeakNeutral4ptDisc.hpp \
+  Modules/MFermion/GaugeProp.hpp \
+  Modules/MGauge/Load.hpp \
+  Modules/MGauge/Random.hpp \
+  Modules/MGauge/StochEm.hpp \
+  Modules/MGauge/Unit.hpp \
+  Modules/MLoop/NoiseLoop.hpp \
+  Modules/MScalar/ChargedProp.hpp \
+  Modules/MScalar/FreeProp.hpp \
+  Modules/MScalar/Scalar.hpp \
+  Modules/MSink/Point.hpp \
+  Modules/MSolver/RBPrecCG.hpp \
+  Modules/MSource/Point.hpp \
+  Modules/MSource/SeqGamma.hpp \
+  Modules/MSource/Wall.hpp \
+  Modules/MSource/Z2.hpp
@@ -103,29 +103,32 @@ namespace Grid {
   GridBase *CoarseGrid;
   GridBase *FineGrid;
   std::vector<Lattice<Fobj> > subspace;
+  int checkerboard;
 
-  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) :
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
     CoarseGrid(_CoarseGrid),
     FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid)
+    subspace(nbasis,_FineGrid),
+    checkerboard(_checkerboard)
   {
   };
 
   void Orthogonalise(void){
     CoarseScalar InnerProd(CoarseGrid);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
     blockOrthogonalise(InnerProd,subspace);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+    // std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
+    // CheckOrthogonal();
   }
   void CheckOrthogonal(void){
     CoarseVector iProj(CoarseGrid);
     CoarseVector eProj(CoarseGrid);
-    Lattice<CComplex> pokey(CoarseGrid);
 
     for(int i=0;i<nbasis;i++){
       blockProject(iProj,subspace[i],subspace);
 
       eProj=zero;
-      for(int ss=0;ss<CoarseGrid->oSites();ss++){
+      parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
        eProj._odata[ss](i)=CComplex(1.0);
       }
       eProj=eProj - iProj;
@@ -137,6 +140,7 @@ namespace Grid {
     blockProject(CoarseVec,FineVec,subspace);
   }
   void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+    FineVec.checkerboard = subspace[0].checkerboard;
     blockPromote(CoarseVec,FineVec,subspace);
   }
   void CreateSubspaceRandom(GridParallelRNG &RNG){
@@ -147,6 +151,7 @@ namespace Grid {
     Orthogonalise();
   }
 
+  /*
   virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
   {
     // Run a Lanczos with sloppy convergence
@@ -195,7 +200,7 @@ namespace Grid {
       std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
     }
   }
-
+  */
   virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
 
     RealD scale;
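The Orthogonalise() change above runs blockOrthogonalise twice ("pass 1" and "pass 2"). As a general remark rather than anything stated in the commit: a single Gram-Schmidt sweep,

\[
v_i \;\leftarrow\; v_i - \sum_{j<i} \frac{\langle v_j, v_i\rangle}{\langle v_j, v_j\rangle}\, v_j ,
\]

can lose orthogonality in finite precision, and repeating the sweep once more (the usual "twice is enough" rule) restores it to near machine precision.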
@@ -162,15 +162,10 @@ namespace Grid {
     _Mat.M(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    ComplexD dot;
-
     _Mat.M(in,out);
-
-    dot= innerProduct(in,out);
-    n1=real(dot);
-
-    dot = innerProduct(out,out);
-    n2=real(dot);
+    ComplexD dot= innerProduct(in,out); n1=real(dot);
+    n2=norm2(out);
   }
   void HermOp(const Field &in, Field &out){
     _Mat.M(in,out);
@@ -192,10 +187,10 @@ namespace Grid {
     ni=Mpc(in,tmp);
     no=MpcDag(tmp,out);
   }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     MpcDagMpc(in,out,n1,n2);
   }
-  void HermOp(const Field &in, Field &out){
+  virtual void HermOp(const Field &in, Field &out){
     RealD n1,n2;
     HermOpAndNorm(in,out,n1,n2);
   }
@@ -212,7 +207,6 @@ namespace Grid {
   void OpDir (const Field &in, Field &out,int dir,int disp) {
     assert(0);
   }
-
  };
  template<class Matrix,class Field>
  class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@@ -270,7 +264,6 @@ namespace Grid {
     return axpy_norm(out,-1.0,tmp,in);
   }
  };
-
  template<class Matrix,class Field>
  class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
  protected:
@@ -299,6 +292,45 @@ namespace Grid {
     return axpy_norm(out,-1.0,tmp,in);
   }
  };
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ // Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+ // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+ template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ // Staggered use
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ template<class Matrix,class Field>
+ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
+ protected:
+   Matrix &_Mat;
+ public:
+   SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
+   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+     n2 = Mpc(in,out);
+     ComplexD dot= innerProduct(in,out);
+     n1 = real(dot);
+   }
+   virtual void HermOp(const Field &in, Field &out){
+     Mpc(in,out);
+   }
+   virtual RealD Mpc (const Field &in, Field &out) {
+     Field tmp(in._grid);
+     _Mat.Meooe(in,tmp);
+     _Mat.MooeeInv(tmp,out);
+     _Mat.Meooe(out,tmp);
+     _Mat.Mooee(in,out);
+     return axpy_norm(out,-1.0,tmp,out);
+   }
+   virtual RealD MpcDag (const Field &in, Field &out){
+     return Mpc(in,out);
+   }
+   virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+     assert(0);// Never need with staggered
+   }
+ };
+ template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
+
 /////////////////////////////////////////////////////////////
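The "Left handed" / "Right handed" comments above refer to eliminating one checkerboard from the 2x2 block form of the operator. As a sketch of the standard Schur-complement algebra behind them (my paraphrase, not text from the patch): with

\[
M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
\qquad
M\begin{pmatrix}\psi_e\\ \psi_o\end{pmatrix}=\begin{pmatrix}\eta_e\\ \eta_o\end{pmatrix},
\]

solving the even block for psi_e and substituting gives

\[
\big(M_{oo} - M_{oe} M_{ee}^{-1} M_{eo}\big)\,\psi_o = \eta_o - M_{oe} M_{ee}^{-1}\eta_e ,
\qquad
\psi_e = M_{ee}^{-1}\big(\eta_e - M_{eo}\psi_o\big),
\]

which is the combination SchurStaggeredOperator::Mpc assembles above via Meooe, MooeeInv, Meooe, Mooee and the final axpy_norm.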
@@ -314,6 +346,14 @@ namespace Grid {
    virtual void operator() (const Field &in, Field &out) = 0;
  };
 
+ template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
+ public:
+   void operator() (const Field &in, Field &out){
+     out = in;
+   };
+ };
+
 /////////////////////////////////////////////////////////////
 // Base classes for Multishift solvers for operators
 /////////////////////////////////////////////////////////////
@@ -336,6 +376,64 @@ namespace Grid {
  };
 */
 
+ ////////////////////////////////////////////////////////////////////////////////////////////
+ // Hermitian operator Linear function and operator function
+ ////////////////////////////////////////////////////////////////////////////////////////////
+ template<class Field>
+ class HermOpOperatorFunction : public OperatorFunction<Field> {
+   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+     Linop.HermOp(in,out);
+   };
+ };
+
+ template<typename Field>
+ class PlainHermOp : public LinearFunction<Field> {
+ public:
+   LinearOperatorBase<Field> &_Linop;
+
+   PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
+   {}
+
+   void operator()(const Field& in, Field& out) {
+     _Linop.HermOp(in,out);
+   }
+ };
+
+ template<typename Field>
+ class FunctionHermOp : public LinearFunction<Field> {
+ public:
+   OperatorFunction<Field>   & _poly;
+   LinearOperatorBase<Field> &_Linop;
+
+   FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
+     : _poly(poly), _Linop(linop) {};
+
+   void operator()(const Field& in, Field& out) {
+     _poly(_Linop,in,out);
+   }
+ };
+
+ template<class Field>
+ class Polynomial : public OperatorFunction<Field> {
+ private:
+   std::vector<RealD> Coeffs;
+ public:
+   Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
+
+   // Implement the required interface
+   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+     Field AtoN(in._grid);
+     Field Mtmp(in._grid);
+     AtoN = in;
+     out = AtoN*Coeffs[0];
+     for(int n=1;n<Coeffs.size();n++){
+       Mtmp = AtoN;
+       Linop.HermOp(Mtmp,AtoN);
+       out=out+AtoN*Coeffs[n];
+     }
+   };
+ };
+
 }
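The Polynomial functor added above evaluates out = sum_n Coeffs[n] * Linop^n(in), keeping only the running power of the operator. A minimal standalone sketch of the same recurrence, using a plain std::function in place of Grid's LinearOperatorBase (polyApply and A are illustrative names, not Grid API):

    #include <cstddef>
    #include <functional>
    #include <vector>

    // out = sum_n coeffs[n] * A^n(in); only the current power A^n(in) is stored.
    std::vector<double> polyApply(const std::function<std::vector<double>(const std::vector<double>&)>& A,
                                  const std::vector<double>& coeffs,
                                  const std::vector<double>& in)
    {
        std::vector<double> AtoN = in;                    // A^0 applied to in
        std::vector<double> out(in.size(), 0.0);
        for (std::size_t i = 0; i < in.size(); ++i) out[i] = coeffs[0] * AtoN[i];
        for (std::size_t n = 1; n < coeffs.size(); ++n) {
            AtoN = A(AtoN);                               // advance to A^n(in)
            for (std::size_t i = 0; i < in.size(); ++i) out[i] += coeffs[n] * AtoN[i];
        }
        return out;
    }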
@@ -8,6 +8,7 @@
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <clehner@bnl.gov>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,41 +34,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid {
 
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Simple general polynomial with user supplied coefficients
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field>
-  class HermOpOperatorFunction : public OperatorFunction<Field> {
-    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
-      Linop.HermOp(in,out);
-    };
-  };
-
-  template<class Field>
-  class Polynomial : public OperatorFunction<Field> {
-  private:
-    std::vector<RealD> Coeffs;
-  public:
-    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
-
-    // Implement the required interface
-    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
-
-      Field AtoN(in._grid);
-      Field Mtmp(in._grid);
-      AtoN = in;
-      out = AtoN*Coeffs[0];
-//    std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
-//    std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
-      for(int n=1;n<Coeffs.size();n++){
-        Mtmp = AtoN;
-        Linop.HermOp(Mtmp,AtoN);
-        out=out+AtoN*Coeffs[n];
-//      std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
-//      std::cout << n<<" " <<norm2(out)<<std::endl;
-      }
-    };
-  };
+  struct ChebyParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
+                                    RealD, alpha,
+                                    RealD, beta,
+                                    int, Npoly);
+  };
 
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Generic Chebyshev approximations
@@ -82,8 +54,10 @@ namespace Grid {
 
   public:
     void csv(std::ostream &out){
       RealD diff = hi-lo;
-      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
+      RealD delta = (hi-lo)*1.0e-9;
+      for (RealD x=lo; x<hi; x+=delta) {
+        delta*=1.1;
         RealD f = approx(x);
         out<< x<<" "<<f<<std::endl;
       }
@@ -99,6 +73,7 @@ namespace Grid {
     };
 
     Chebyshev(){};
+    Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
     Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
     Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
 
@@ -193,6 +168,47 @@ namespace Grid {
       return sum;
     };
 
+    RealD approxD(RealD x)
+    {
+      RealD Un;
+      RealD Unm;
+      RealD Unp;
+
+      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+
+      RealD U0=1;
+      RealD U1=2*y;
+
+      RealD sum;
+      sum = Coeffs[1]*U0;
+      sum+= Coeffs[2]*U1*2.0;
+
+      Un =U1;
+      Unm=U0;
+      for(int i=2;i<order-1;i++){
+        Unp=2*y*Un-Unm;
+        Unm=Un;
+        Un =Unp;
+        sum+= Un*Coeffs[i+1]*(i+1.0);
+      }
+      return sum/(0.5*(hi-lo));
+    };
+
+    RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
+      RealD x = x0;
+      RealD eps;
+
+      int i;
+      for (i=0;i<maxiter;i++) {
+        eps = approx(x) - z;
+        if (fabs(eps / z) < resid)
+          return x;
+        x = x - eps / approxD(x);
+      }
+
+      return std::numeric_limits<double>::quiet_NaN();
+    }
+
     // Implement the required interface
     void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 
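approxD() returns the derivative of the Chebyshev interpolant, using the standard identity T_n'(y) = n U_{n-1}(y) together with the chain-rule factor from mapping [lo,hi] onto [-1,1], and approxInv() then inverts the interpolant by Newton iteration. As a sketch of the update the added loop performs:

\[
x_{k+1} = x_k - \frac{p(x_k) - z}{p'(x_k)},
\]

stopping once |p(x_k) - z| / |z| < resid, and returning NaN if maxiter is exhausted.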
|
@ -78,12 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
cp = a;
|
cp = a;
|
||||||
ssq = norm2(src);
|
ssq = norm2(src);
|
||||||
|
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
|
||||||
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
|
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;
|
||||||
|
|
||||||
RealD rsq = Tolerance * Tolerance * ssq;
|
RealD rsq = Tolerance * Tolerance * ssq;
|
||||||
|
|
||||||
@ -92,7 +92,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << GridLogIterative << std::setprecision(4)
|
std::cout << GridLogIterative << std::setprecision(8)
|
||||||
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
||||||
|
|
||||||
GridStopWatch LinalgTimer;
|
GridStopWatch LinalgTimer;
|
||||||
|
@@ -7,8 +7,9 @@
 Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Chulwoo Jung
-Author: Guido Cossu
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+Author: Christoph Lehner <clehner@bnl.gov>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -27,108 +28,269 @@ Author: Guido Cossu
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-#ifndef GRID_IRL_H
-#define GRID_IRL_H
+#ifndef GRID_BIRL_H
+#define GRID_BIRL_H
 
 #include <string.h> //memset
+//#include <zlib.h>
+#include <sys/stat.h>
 
 namespace Grid {
 
-enum IRLdiagonalisation {
-  IRLdiagonaliseWithDSTEGR,
-  IRLdiagonaliseWithQR,
-  IRLdiagonaliseWithEigen
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Helper class for sorting the evalues AND evectors by Field
-// Use pointer swizzle on vectors
-////////////////////////////////////////////////////////////////////////////////
-template<class Field>
-class SortEigen {
- private:
-  static bool less_lmd(RealD left,RealD right){
-    return left > right;
-  }
-  static bool less_pair(std::pair<RealD,Field const*>& left,
-                        std::pair<RealD,Field const*>& right){
-    return left.first > (right.first);
-  }
-
- public:
-  void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
-    ////////////////////////////////////////////////////////////////////////
-    // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
-    //    : The vector reorder should be done by pointer swizzle somehow
-    ////////////////////////////////////////////////////////////////////////
-    std::vector<Field> cpy(lmd.size(),evec[0]._grid);
-    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
-
-    std::vector<std::pair<RealD, Field const*> > emod(lmd.size());
-    for(int i=0;i<lmd.size();++i) emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
-
-    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
-
-    typename std::vector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
-    for(int i=0;i<N;++i){
-      lmd[i]=it->first;
-      evec[i]=*(it->second);
-      ++it;
-    }
-  }
-  void push(std::vector<RealD>& lmd,int N) {
-    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
-  }
-  bool saturated(RealD lmd, RealD thrs) {
-    return fabs(lmd) > fabs(thrs);
-  }
-};
+////////////////////////////////////////////////////////
+// Move following 100 LOC to lattice/Lattice_basis.h
+////////////////////////////////////////////////////////
+template<class Field>
+void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
+{
+  for(int j=0; j<k; ++j){
+    auto ip = innerProduct(basis[j],w);
+    w = w - ip*basis[j];
+  }
+}
+
+template<class Field>
+void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
+{
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0]._grid;
+
+  parallel_region
+  {
+    std::vector < vobj > B(Nm); // Thread private
+
+    parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
+      for(int j=j0; j<j1; ++j) B[j]=0.;
+
+      for(int j=j0; j<j1; ++j){
+        for(int k=k0; k<k1; ++k){
+          B[j] +=Qt(j,k) * basis[k]._odata[ss];
+        }
+      }
+      for(int j=j0; j<j1; ++j){
+        basis[j]._odata[ss] = B[j];
+      }
+    }
+  }
+}
+
+// Extract a single rotated vector
+template<class Field>
+void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
+{
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0]._grid;
+
+  result.checkerboard = basis[0].checkerboard;
+  parallel_for(int ss=0;ss < grid->oSites();ss++){
+    vobj B = zero;
+    for(int k=k0; k<k1; ++k){
+      B +=Qt(j,k) * basis[k]._odata[ss];
+    }
+    result._odata[ss] = B;
+  }
+}
+
+template<class Field>
+void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
+{
+  int vlen = idx.size();
+
+  assert(vlen>=1);
+  assert(vlen<=sort_vals.size());
+  assert(vlen<=_v.size());
+
+  for (size_t i=0;i<vlen;i++) {
+
+    if (idx[i] != i) {
+
+      //////////////////////////////////////
+      // idx[i] is a table of desired sources giving a permutation.
+      // Swap v[i] with v[idx[i]].
+      // Find j>i for which _vnew[j] = _vold[i],
+      // track the move idx[j] => idx[i]
+      // track the move idx[i] => i
+      //////////////////////////////////////
+      size_t j;
+      for (j=i;j<idx.size();j++)
+        if (idx[j]==i)
+          break;
+
+      assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
+
+      std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
+      std::swap(sort_vals[i],sort_vals[idx[i]]);
+
+      idx[j] = idx[i];
+      idx[i] = i;
+    }
+  }
+}
+
+inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
+{
+  std::vector<int> idx(sort_vals.size());
+  std::iota(idx.begin(), idx.end(), 0);
+
+  // sort indexes based on comparing values in v
+  std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
+    return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
+  });
+  return idx;
+}
+
+template<class Field>
+void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
+{
+  std::vector<int> idx = basisSortGetIndex(sort_vals);
+  if (reverse)
+    std::reverse(idx.begin(), idx.end());
+
+  basisReorderInPlace(_v,sort_vals,idx);
+}
+
+// PAB: faster to compute the inner products first then fuse loops.
+// If performance critical can improve.
+template<class Field>
+void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
+  result = zero;
+  assert(_v.size()==eval.size());
+  int N = (int)_v.size();
+  for (int i=0;i<N;i++) {
+    Field& tmp = _v[i];
+    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+  }
+}
 
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
+template<class Field> class ImplicitlyRestartedLanczosTester
+{
+ public:
+  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox);
+  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox);
+};
+
+enum IRLdiagonalisation {
+  IRLdiagonaliseWithDSTEGR,
+  IRLdiagonaliseWithQR,
+  IRLdiagonaliseWithEigen
+};
+
+template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
+{
+ public:
+  LinearFunction<Field> &_HermOpTest;
+  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOpTest) : _HermOpTest(HermOpTest) { };
+  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
+  {
+    return TestConvergence(j,resid,B,eval,evalMaxApprox);
+  }
+  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
+  {
+    Field v(B);
+    RealD eval_poly = eval;
+    // Apply operator
+    _HermOpTest(B,v);
+
+    RealD vnum = real(innerProduct(B,v)); // HermOp.
+    RealD vden = norm2(B);
+    RealD vv0  = norm2(v);
+    eval = vnum/vden;
+    v -= eval*B;
+
+    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
+             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+             <<std::endl;
+
+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+
+    return conv;
+  }
+};
+
 template<class Field>
 class ImplicitlyRestartedLanczos {
 private:
-  int MaxIter;   // Max iterations
+  const RealD small = 1.0e-8;
+  int MaxIter;
+  int MinRestart; // Minimum number of restarts; only check for convergence after
   int Nstop;   // Number of evecs checked for convergence
   int Nk;      // Number of converged sought
-  int Nm;      // Nm -- total number of vectors
-  RealD eresid;
+  // int Np;   // Np -- Number of spare vecs in krylov space // == Nm - Nk
+  int Nm;      // Nm -- total number of vectors
   IRLdiagonalisation diagonalisation;
-  ////////////////////////////////////
-  // Embedded objects
-  ////////////////////////////////////
-  SortEigen<Field> _sort;
-  LinearOperatorBase<Field> &_Linop;
-  OperatorFunction<Field>   &_poly;
+  int orth_period;
+
+  RealD OrthoTime;
+  RealD eresid, betastp;
+  ////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////
+  LinearFunction<Field> &_HermOp;
+  LinearFunction<Field> &_HermOpTest;
+  ImplicitlyRestartedLanczosTester<Field> &_Tester;
+  // Default tester provided (we need a ref to something in default case)
+  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
   /////////////////////////
   // Constructor
   /////////////////////////
 
public:
-  ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
-                             OperatorFunction<Field> & poly,   // polynomial
-                             int _Nstop, // really sought vecs
-                             int _Nk, // sought vecs
-                             int _Nm, // total vecs
-                             RealD _eresid, // resid in lmd deficit
-                             int _MaxIter, // Max iterations
-                             IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) :
-    _Linop(Linop), _poly(poly),
-    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
-    eresid(_eresid), MaxIter(_MaxIter),
-    diagonalisation(_diagonalisation)
-  { };
+  //////////////////////////////////////////////////////////////////
+  // PAB:
+  //////////////////////////////////////////////////////////////////
+  // Too many options & knobs. Do we really need orth_period
+  // What is the theoretical basis & guarantees of betastp ?
+  // Nstop=Nk viable?
+  // MinRestart avoidable with new convergence test?
+  // Could cut to HermOp, HermOpTest, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
+  // HermOpTest could be eliminated if we dropped the Power method for max eval.
+  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
+  //////////////////////////////////////////////////////////////////
+  ImplicitlyRestartedLanczos(LinearFunction<Field> & HermOp,
+                             LinearFunction<Field> & HermOpTest,
+                             ImplicitlyRestartedLanczosTester<Field> & Tester,
+                             int _Nstop, // sought vecs
+                             int _Nk, // sought vecs
+                             int _Nm, // spare vecs
+                             RealD _eresid, // resid in lmdue deficit
+                             int _MaxIter, // Max iterations
+                             RealD _betastp=0.0, // if beta(k) < betastp: converged
+                             int _MinRestart=1, int _orth_period = 1,
+                             IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(Tester),
+    Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
+    eresid(_eresid), betastp(_betastp),
+    MaxIter(_MaxIter) , MinRestart(_MinRestart),
+    orth_period(_orth_period), diagonalisation(_diagonalisation) { };
+
+  ImplicitlyRestartedLanczos(LinearFunction<Field> & HermOp,
+                             LinearFunction<Field> & HermOpTest,
+                             int _Nstop, // sought vecs
+                             int _Nk, // sought vecs
+                             int _Nm, // spare vecs
+                             RealD _eresid, // resid in lmdue deficit
+                             int _MaxIter, // Max iterations
+                             RealD _betastp=0.0, // if beta(k) < betastp: converged
+                             int _MinRestart=1, int _orth_period = 1,
+                             IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(SimpleTester),
+    Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
+    eresid(_eresid), betastp(_betastp),
+    MaxIter(_MaxIter) , MinRestart(_MinRestart),
+    orth_period(_orth_period), diagonalisation(_diagonalisation) { };
 
   ////////////////////////////////
   // Helpers
   ////////////////////////////////
-  static RealD normalise(Field& v)
+  template<typename T> static RealD normalise(T& v)
   {
     RealD nn = norm2(v);
     nn = sqrt(nn);
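basisSortGetIndex() in the block above is an argsort: it orders an index table by |eigenvalue| and leaves the large lattice fields where they are; basisReorderInPlace() then applies that permutation with pairwise swaps. A self-contained illustration of the same idea on plain doubles (the helper name is illustrative, not part of Grid):

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // Return indices that order vals by ascending |value| without moving the data.
    std::vector<int> argsortByAbs(const std::vector<double>& vals)
    {
        std::vector<int> idx(vals.size());
        std::iota(idx.begin(), idx.end(), 0);   // 0,1,2,...
        std::sort(idx.begin(), idx.end(),
                  [&vals](int a, int b) { return std::fabs(vals[a]) < std::fabs(vals[b]); });
        return idx;
    }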
@@ -136,16 +298,12 @@ public:
     return nn;
   }
 
-  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
+  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
   {
-    typedef typename Field::scalar_type MyComplex;
-    MyComplex ip;
-
-    for(int j=0; j<k; ++j){
-      ip = innerProduct(evec[j],w);
-      w = w - ip * evec[j];
-    }
+    OrthoTime-=usecond()/1e6;
+    basisOrthogonalize(evec,w,k);
     normalise(w);
+    OrthoTime+=usecond()/1e6;
   }
 
 /* Rudy Arthur's thesis pp.137
@@ -165,184 +323,234 @@ repeat
 →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
 until convergence
 */
-  void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv)
+  void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=true)
   {
-    GridBase *grid = evec[0]._grid;
-    assert(grid == src._grid);
-
-    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
-    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogMessage <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
-    std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
-    std::cout << GridLogMessage <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
-    std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl;
-    std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl;
+    GridBase *grid = src._grid;
+    assert(grid == evec[0]._grid);
+
+    GridLogIRL.TimingMode(1);
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
+    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
     if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
-      std::cout << GridLogMessage << "Diagonalisation is DSTEGR "<<std::endl;
+      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
     } else if ( diagonalisation == IRLdiagonaliseWithQR ) {
-      std::cout << GridLogMessage << "Diagonalisation is QR "<<std::endl;
+      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
    } else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
-      std::cout << GridLogMessage << "Diagonalisation is Eigen "<<std::endl;
+      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
     }
-    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
 
-    assert(Nm == evec.size() && Nm == eval.size());
+    assert(Nm <= evec.size() && Nm <= eval.size());
+
+    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
+    RealD evalMaxApprox = 0.0;
+    {
+      auto src_n = src;
+      auto tmp = src;
+      const int _MAX_ITER_IRL_MEVAPP_ = 50;
+      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
+        _HermOpTest(src_n,tmp);
+        RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+        RealD vden = norm2(src_n);
+        RealD na = vnum/vden;
+        if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+          i=_MAX_ITER_IRL_MEVAPP_;
+        evalMaxApprox = na;
+        std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+        src_n = tmp;
+      }
+    }
 
     std::vector<RealD> lme(Nm);
     std::vector<RealD> lme2(Nm);
     std::vector<RealD> eval2(Nm);
+    std::vector<RealD> eval2_copy(Nm);
     Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
 
-    std::vector<int>   Iconv(Nm);
-    std::vector<Field> B(Nm,grid); // waste of space replicating
-
     Field f(grid);
     Field v(grid);
 
     int k1 = 1;
     int k2 = Nk;
+    RealD beta_k;
 
     Nconv = 0;
 
-    RealD beta_k;
-
     // Set initial vector
     evec[0] = src;
-    std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl;
-
     normalise(evec[0]);
-    std::cout << GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
 
     // Initial Nk steps
+    OrthoTime=0.;
     for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
+    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
 
+    //////////////////////////////////
     // Restarting loop begins
+    //////////////////////////////////
     int iter;
     for(iter = 0; iter<MaxIter; ++iter){
+
+      OrthoTime=0.;
 
       std::cout<< GridLogMessage <<" **********************"<< std::endl;
       std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
       std::cout<< GridLogMessage <<" **********************"<< std::endl;
 
+      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
 
      f *= lme[Nm-1];
 
+      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
+      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
 
+      //////////////////////////////////
      // getting eigenvalues
+      //////////////////////////////////
      for(int k=0; k<Nm; ++k){
        eval2[k] = eval[k+k1-1];
        lme2[k] = lme[k+k1-1];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
 
+      //////////////////////////////////
      // sorting
-      _sort.push(eval2,Nm);
+      //////////////////////////////////
+      eval2_copy = eval2;
+      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
+      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
+      const int chunk=8;
+      for(int io=0; io<k2;io+=chunk){
+        std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
+        for(int ii=0;ii<chunk;ii++){
+          if ( (io+ii)<k2 )
+            std::cout<< " "<< std::setw(12)<< eval2[io+ii];
+        }
+        std::cout << std::endl;
+      }
 
+      //////////////////////////////////
      // Implicitly shifted QR transformations
+      //////////////////////////////////
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      for(int ip=k2; ip<Nm; ++ip){
-        // Eigen replacement for qr_decomp ???
-        qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
+        QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
      }
+      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
 
-      for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
-
-      for(int j=k1-1; j<k2+1; ++j){
-        for(int k=0; k<Nm; ++k){
-          B[j].checkerboard = evec[k].checkerboard;
-          B[j] += Qt(j,k) * evec[k];
-        }
-      }
-      for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
+      assert(k2<Nm); assert(k2<Nm); assert(k1>0);
+
+      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
+      std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
 
+      ////////////////////////////////////////////////////
      // Compressed vector f and beta(k2)
+      ////////////////////////////////////////////////////
      f *= Qt(k2-1,Nm-1);
      f += lme[k2-1] * evec[k2];
      beta_k = norm2(f);
      beta_k = sqrt(beta_k);
-      std::cout<< GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
+      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
 
      RealD betar = 1.0/beta_k;
      evec[k2] = betar * f;
      lme[k2-1] = beta_k;
 
+      ////////////////////////////////////////////////////
      // Convergence test
+      ////////////////////////////////////////////////////
      for(int k=0; k<Nm; ++k){
        eval2[k] = eval[k];
        lme2[k] = lme[k];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
 
-      for(int k = 0; k<Nk; ++k) B[k]=0.0;
-
-      for(int j = 0; j<Nk; ++j){
-        for(int k = 0; k<Nk; ++k){
-          B[j].checkerboard = evec[k].checkerboard;
-          B[j] += Qt(j,k) * evec[k];
-        }
-      }
 
      Nconv = 0;
-      for(int i=0; i<Nk; ++i){
-
-        _Linop.HermOp(B[i],v);
-
-        RealD vnum = real(innerProduct(B[i],v)); // HermOp.
-        RealD vden = norm2(B[i]);
-        eval2[i] = vnum/vden;
-        v -= eval2[i]*B[i];
-        RealD vv = norm2(v);
-
-        std::cout.precision(13);
-        std::cout << GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+      if (iter >= MinRestart) {
+
+        std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
+
+        Field B(grid); B.checkerboard = evec[0].checkerboard;
+
+        // power of two search pattern; not every evalue in eval2 is assessed.
|
for(int jj = 1; jj<=Nstop; jj*=2){
|
||||||
std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
|
int j = Nstop-jj;
|
||||||
std::cout << " |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
|
RealD e = eval2_copy[j]; // Discard the evalue
|
||||||
|
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
|
||||||
// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
|
if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
|
||||||
if((vv<eresid*eresid) && (i == Nconv) ){
|
if ( j > Nconv ) {
|
||||||
Iconv[Nconv] = i;
|
Nconv=j+1;
|
||||||
++Nconv;
|
jj=Nstop; // Terminate the scan
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Do evec[0] for good measure
|
||||||
|
{
|
||||||
|
int j=0;
|
||||||
|
RealD e = eval2_copy[0];
|
||||||
|
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
|
||||||
|
_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
|
||||||
|
}
|
||||||
|
// test if we converged, if so, terminate
|
||||||
|
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
|
||||||
|
// if( Nconv>=Nstop || beta_k < betastp){
|
||||||
|
if( Nconv>=Nstop){
|
||||||
|
goto converged;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // i-loop end
|
} else {
|
||||||
|
std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
|
||||||
|
} // end of iter loop
|
||||||
|
}
|
||||||
|
|
||||||
std::cout<< GridLogMessage <<" #modes converged: "<<Nconv<<std::endl;
|
std::cout<<GridLogError<<"\n NOT converged.\n";
|
||||||
|
|
||||||
if( Nconv>=Nstop ){
|
|
||||||
goto converged;
|
|
||||||
}
|
|
||||||
} // end of iter loop
|
|
||||||
|
|
||||||
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
|
|
||||||
std::cout<< GridLogError <<" ImplicitlyRestartedLanczos::calc() NOT converged.";
|
|
||||||
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
|
|
||||||
abort();
|
abort();
|
||||||
|
|
||||||
converged:
|
converged:
|
||||||
// Sorting
|
{
|
||||||
eval.resize(Nconv);
|
Field B(grid); B.checkerboard = evec[0].checkerboard;
|
||||||
evec.resize(Nconv,grid);
|
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
|
||||||
for(int i=0; i<Nconv; ++i){
|
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
|
||||||
eval[i] = eval2[Iconv[i]];
|
Nconv=0;
|
||||||
evec[i] = B[Iconv[i]];
|
//////////////////////////////////////////////////////////////////////
|
||||||
}
|
// Full final convergence test; unconditionally applied
|
||||||
_sort.push(eval,evec,Nconv);
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
for(int j = 0; j<=Nk; j++){
|
||||||
|
B=evec[j];
|
||||||
|
if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
|
||||||
|
Nconv++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
|
if ( Nconv < Nstop )
|
||||||
std::cout << GridLogMessage << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
|
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
|
||||||
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
|
|
||||||
std::cout << GridLogMessage << " -- Iterations = "<< iter << "\n";
|
eval=eval2;
|
||||||
std::cout << GridLogMessage << " -- beta(k) = "<< beta_k << "\n";
|
|
||||||
std::cout << GridLogMessage << " -- Nconv = "<< Nconv << "\n";
|
basisSortInPlace(evec,eval,reverse);
|
||||||
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
|
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
|
||||||
|
std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
|
||||||
|
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
|
||||||
|
std::cout << GridLogIRL << " -- Iterations = "<< iter << "\n";
|
||||||
|
std::cout << GridLogIRL << " -- beta(k) = "<< beta_k << "\n";
|
||||||
|
std::cout << GridLogIRL << " -- Nconv = "<< Nconv << "\n";
|
||||||
|
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/* Saad PP. 195
|
/* Saad PP. 195
|
||||||
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
|
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
|
||||||
2. For k = 1,2,...,m Do:
|
2. For k = 1,2,...,m Do:
|
||||||
@ -361,14 +569,18 @@ private:
|
|||||||
const RealD tiny = 1.0e-20;
|
const RealD tiny = 1.0e-20;
|
||||||
assert( k< Nm );
|
assert( k< Nm );
|
||||||
|
|
||||||
_poly(_Linop,evec[k],w); // 3. wk:=Avk−βkv_{k−1}
|
GridStopWatch gsw_op,gsw_o;
|
||||||
|
|
||||||
|
Field& evec_k = evec[k];
|
||||||
|
|
||||||
|
_HermOp(evec_k,w); std::cout<<GridLogIRL << "Poly(HermOp)" <<std::endl;
|
||||||
|
|
||||||
if(k>0) w -= lme[k-1] * evec[k-1];
|
if(k>0) w -= lme[k-1] * evec[k-1];
|
||||||
|
|
||||||
ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
|
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
|
||||||
RealD alph = real(zalph);
|
RealD alph = real(zalph);
|
||||||
|
|
||||||
w = w - alph * evec[k];// 5. wk:=wk−αkvk
|
w = w - alph * evec_k;// 5. wk:=wk−αkvk
|
||||||
|
|
||||||
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
|
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
|
||||||
// 7. vk+1 := wk/βk+1
|
// 7. vk+1 := wk/βk+1
|
||||||
@ -376,10 +588,16 @@ private:
|
|||||||
lmd[k] = alph;
|
lmd[k] = alph;
|
||||||
lme[k] = beta;
|
lme[k] = beta;
|
||||||
|
|
||||||
if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
|
if (k>0 && k % orth_period == 0) {
|
||||||
if ( k < Nm-1) evec[k+1] = w;
|
orthogonalize(w,evec,k); // orthonormalise
|
||||||
|
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
|
if(k < Nm-1) evec[k+1] = w;
|
||||||
|
|
||||||
|
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
|
||||||
|
if ( beta < tiny )
|
||||||
|
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
|
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
|
||||||
@ -404,11 +622,11 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
// File could end here if settle on Eigen ???
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void qr_decomp(std::vector<RealD>& lmd, // Nm
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// File could end here if settle on Eigen ??? !!!
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
void QR_decomp(std::vector<RealD>& lmd, // Nm
|
||||||
std::vector<RealD>& lme, // Nm
|
std::vector<RealD>& lme, // Nm
|
||||||
int Nk, int Nm, // Nk, Nm
|
int Nk, int Nm, // Nk, Nm
|
||||||
Eigen::MatrixXd& Qt, // Nm x Nm matrix
|
Eigen::MatrixXd& Qt, // Nm x Nm matrix
|
||||||
@ -575,51 +793,50 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
|
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
|
||||||
int Nk, int Nm,
|
int Nk, int Nm,
|
||||||
Eigen::MatrixXd & Qt,
|
Eigen::MatrixXd & Qt,
|
||||||
GridBase *grid)
|
GridBase *grid)
|
||||||
{
|
{
|
||||||
int Niter = 100*Nm;
|
int QRiter = 100*Nm;
|
||||||
int kmin = 1;
|
int kmin = 1;
|
||||||
int kmax = Nk;
|
int kmax = Nk;
|
||||||
|
|
||||||
// (this should be more sophisticated)
|
// (this should be more sophisticated)
|
||||||
for(int iter=0; iter<Niter; ++iter){
|
for(int iter=0; iter<QRiter; ++iter){
|
||||||
|
|
||||||
// determination of 2x2 leading submatrix
|
// determination of 2x2 leading submatrix
|
||||||
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
|
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
|
||||||
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
|
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
|
||||||
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
|
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
|
||||||
// (Dsh: shift)
|
// (Dsh: shift)
|
||||||
|
|
||||||
// transformation
|
// transformation
|
||||||
qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
|
QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
|
||||||
|
|
||||||
// Convergence criterion (redef of kmin and kamx)
|
// Convergence criterion (redef of kmin and kamx)
|
||||||
for(int j=kmax-1; j>= kmin; --j){
|
for(int j=kmax-1; j>= kmin; --j){
|
||||||
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
|
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
|
||||||
if(fabs(lme[j-1])+dds > dds){
|
if(fabs(lme[j-1])+dds > dds){
|
||||||
kmax = j+1;
|
kmax = j+1;
|
||||||
goto continued;
|
goto continued;
|
||||||
}
|
|
||||||
}
|
|
||||||
Niter = iter;
|
|
||||||
return;
|
|
||||||
|
|
||||||
continued:
|
|
||||||
for(int j=0; j<kmax-1; ++j){
|
|
||||||
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
|
|
||||||
if(fabs(lme[j])+dds > dds){
|
|
||||||
kmin = j+1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
|
QRiter = iter;
|
||||||
abort();
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
};
|
continued:
|
||||||
|
for(int j=0; j<kmax-1; ++j){
|
||||||
|
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
|
||||||
|
if(fabs(lme[j])+dds > dds){
|
||||||
|
kmin = j+1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
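A minimal usage sketch of the restarted Lanczos above may help orientation. It is modelled on the IRL.calc() call made from LocalCoherenceLanczos::calcFine() later in this compare; the operator HermOp, the grid pointer, the Chebyshev parameters and the numeric values are placeholders, not part of this patch.

// Illustrative sketch only (not part of this patch): driving calc() the way
// LocalCoherenceLanczos::calcFine() does further down in this compare.
// HermOp, FrbGrid and the Chebyshev parameters are assumed to be supplied
// by the caller; the numeric parameters are invented.
void runIRLSketch(LinearOperatorBase<LatticeFermion> &HermOp,
                  GridBase *FrbGrid, ChebyParams cheby)
{
  const int Nstop=30, Nk=40, Nm=80;                       // Nstop <= Nk <= Nm
  Chebyshev<LatticeFermion>      Cheby(cheby);            // spectral filter
  FunctionHermOp<LatticeFermion> ChebyOp(Cheby,HermOp);   // filtered operator drives the iteration
  PlainHermOp<LatticeFermion>    Op(HermOp);              // unfiltered operator used by the tester

  ImplicitlyRestartedLanczos<LatticeFermion> IRL(ChebyOp,Op,Nstop,Nk,Nm,
                                                 1.0e-8,50,0.0,1); // resid,MaxIt,betastp,MinRestart

  std::vector<RealD>          eval(Nm);
  std::vector<LatticeFermion> evec(Nm,FrbGrid);
  LatticeFermion src(FrbGrid); src=1.0;

  int Nconv;
  IRL.calc(eval,evec,src,Nconv,false);  // same argument pattern as the calcFine() call below
  std::cout << GridLogMessage << "IRL converged " << Nconv << " modes" << std::endl;
}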
lib/algorithms/iterative/LocalCoherenceLanczos.h (new file, 352 lines)
@@ -0,0 +1,352 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h

    Copyright (C) 2015

Author: Christoph Lehner <clehner@bnl.gov>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {

struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
                                  ChebyParams, Cheby,/*Chebyshev*/
                                  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
                                  int, Nk,       /*Vecs in Lanczos seek converge*/
                                  int, Nm,       /*Total vecs in Lanczos include restart*/
                                  RealD, resid,  /*residual*/
                                  int, MaxIt,
                                  RealD, betastp, /* ? */
                                  int, MinRes);   // Must restart
};

struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
                                  bool, doFine,
                                  bool, doFineRead,
                                  bool, doCoarse,
                                  bool, doCoarseRead,
                                  LanczosParams, FineParams,
                                  LanczosParams, CoarseParams,
                                  ChebyParams, Smoother,
                                  RealD , coarse_relax_tol,
                                  std::vector<int>, blockSize,
                                  std::string, config,
                                  std::vector < std::complex<double>  >, omega,
                                  RealD, mass,
                                  RealD, M5);
};

// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>                       FineField;

  LinearOperatorBase<FineField>     &_Linop;
  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;

  ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) :
    _Linop(linop),
    _Aggregate(aggregate)  {  };

  void operator()(const CoarseField& in, CoarseField& out) {

    GridBase *FineGrid = _Aggregate.FineGrid;
    FineField fin(FineGrid);
    FineField fout(FineGrid);

    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
    _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
};

template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>                       FineField;

  OperatorFunction<FineField>       &_poly;
  LinearOperatorBase<FineField>     &_Linop;
  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;

  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,
                          Aggregation<Fobj,CComplex,nbasis> &aggregate) :
    _poly(poly),
    _Linop(linop),
    _Aggregate(aggregate)  {  };

  void operator()(const CoarseField& in, CoarseField& out) {

    GridBase *FineGrid = _Aggregate.FineGrid;

    FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard;
    FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;

    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
};

template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>                       FineField;

  LinearFunction<CoarseField>       &_Poly;
  OperatorFunction<FineField>       &_smoother;
  LinearOperatorBase<FineField>     &_Linop;
  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
  RealD                              _coarse_relax_tol;
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
                                           OperatorFunction<FineField>   &smoother,
                                           LinearOperatorBase<FineField> &Linop,
                                           Aggregation<Fobj,CComplex,nbasis> &Aggregate,
                                           RealD coarse_relax_tol=5.0e3)
    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };

  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval;
    // Apply operator
    _Poly(B,v);

    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    RealD vv0  = norm2(v);
    eval   = vnum/vden;
    v -= eval*B;

    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
    std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
             <<std::endl;

    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    GridBase *FineGrid = _Aggregate.FineGrid;

    int checkerboard = _Aggregate.checkerboard;

    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;

    _Aggregate.PromoteFromSubspace(B,fv);
    _smoother(_Linop,fv,fB);

    RealD eval_poly = eval;
    _Linop.HermOp(fB,fv);

    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
    RealD vden = norm2(fB);
    RealD vv0  = norm2(fv);
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
    std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
             <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
};

////////////////////////////////////////////
// Make serializable Lanczos params
////////////////////////////////////////////
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczos
{
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<Fobj>                       FineField;

 protected:
  GridBase *_CoarseGrid;
  GridBase *_FineGrid;
  int _checkerboard;
  LinearOperatorBase<FineField>               &_FineOp;

  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
  // the hassle and complexity of cross coupling.
  Aggregation<Fobj,CComplex,nbasis>            _Aggregate;
  std::vector<RealD>                           evals_fine;
  std::vector<RealD>                           evals_coarse;
  std::vector<CoarseField>                     evec_coarse;
 public:
  LocalCoherenceLanczos(GridBase *FineGrid,
                        GridBase *CoarseGrid,
                        LinearOperatorBase<FineField> &FineOp,
                        int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _Aggregate(CoarseGrid,FineGrid,checkerboard),
    _FineOp(FineOp),
    _checkerboard(checkerboard)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }

  template<typename T> static RealD normalise(T& v)
  {
    RealD nn = norm2(v);
    nn = ::sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }

  void fakeFine(void)
  {
    int Nk = nbasis;
    _Aggregate.subspace.resize(Nk,_FineGrid);
    _Aggregate.subspace[0]=1.0;
    _Aggregate.subspace[0].checkerboard=_checkerboard;
    normalise(_Aggregate.subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
      _Aggregate.subspace[k].checkerboard=_checkerboard;
      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
      normalise(_Aggregate.subspace[k]);
    }
  }

  void testFine(RealD resid)
  {
    assert(evals_fine.size() == nbasis);
    assert(_Aggregate.subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
    }
  }

  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
  {
    assert(evals_fine.size() == nbasis);
    assert(_Aggregate.subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp    (ChebySmooth,_FineOp,_Aggregate);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);

    for(int k=0;k<evec_coarse.size();k++){
      if ( k < nbasis ) {
        assert(ChebySmoothTester.ReconstructEval(k,resid,evec_coarse[k],evals_coarse[k],1.0)==1);
      } else {
        assert(ChebySmoothTester.ReconstructEval(k,resid*relax,evec_coarse[k],evals_coarse[k],1.0)==1);
      }
    }
  }

  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
                RealD MaxIt, RealD betastp, int MinRes)
  {
    assert(nbasis<=Nm);
    Chebyshev<FineField>      Cheby(cheby_parms);
    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
    PlainHermOp<FineField>    Op(_FineOp);

    evals_fine.resize(Nm);
    _Aggregate.subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);

    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
    _Aggregate.subspace.resize(nbasis,_FineGrid);
  }
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
                  int Nstop, int Nk, int Nm,RealD resid,
                  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp    (Cheby,_FineOp,_Aggregate);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);

    CoarseField src(_CoarseGrid);     src=1.0;

    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
    assert(Nconv>=Nstop);
    evals_coarse.resize(Nstop);
    evec_coarse.resize (Nstop,_CoarseGrid);
    for (int i=0;i<Nstop;i++){
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
};

}
#endif
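Read off the public interface above, the intended call sequence is roughly the following; the template arguments, grids and parameter bundles are placeholders, not values taken from this compare.

// Sketch of the intended LocalCoherenceLanczos call sequence (illustrative;
// the Fobj/CComplex choices, grid pointers and parameter bundles are assumptions).
template<int nbasis>
void runLocalCoherenceSketch(GridBase *FineRBGrid, GridBase *CoarseGrid,
                             LinearOperatorBase<Lattice<vSpinColourVector> > &HermOp,
                             LanczosParams fine, LanczosParams coarse,
                             ChebyParams smoother, RealD relax)
{
  LocalCoherenceLanczos<vSpinColourVector,vTComplex,nbasis> LCL(FineRBGrid,CoarseGrid,HermOp,Odd);

  LCL.calcFine  (fine.Cheby, fine.Nstop, fine.Nk, fine.Nm,
                 fine.resid, fine.MaxIt, fine.betastp, fine.MinRes);
  LCL.Orthogonalise();                              // block-orthogonalise the fine subspace
  LCL.calcCoarse(coarse.Cheby, smoother, relax,
                 coarse.Nstop, coarse.Nk, coarse.Nm,
                 coarse.resid, coarse.MaxIt, coarse.betastp, coarse.MinRes);
  LCL.testCoarse(coarse.resid, smoother, relax);    // smoothed reconstruction check
}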
@@ -53,16 +53,119 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 * M psi = eta
 ***********************
 *Odd
 * i)                 D_oo psi_o =  L^{-1}  eta_o
 *                    eta_o'     = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
 *
 * Wilson:
 *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
 * Stag:
 *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
 *
 * L^-1 eta_o= (1              0 ) (e
 *             (-MoeMee^{-1}   1 )
 *
 *Even
 * ii)  Mee psi_e + Meo psi_o = src_e
 *
 *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 *
 *
 * TODO: Other options:
 *
 * a) change checkerboards for Schur e<->o
 *
 * Left precon by Moo^-1
 * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
 *                              eta_o'     = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
 *
 * Right precon by Moo^-1
 * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
 *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
 *                              psi_o = M_oo^-1 phi_o
 * TODO: Deflation
 */
namespace Grid {

  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////

  template<class Field> class SchurRedBlackStaggeredSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
  public:

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver) :
      _HermitianRBSolver(HermitianRBSolver)
    {
      CBfactorise=0;
    };

    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);

      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);

      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);

      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);

      src_o = tmp;                     assert(src_o.checkerboard ==Odd);
      //      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);

      setCheckerboard(out,sol_e);      assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o);      assert(  sol_o.checkerboard ==Odd );

      // Verify the unprec residual
      _Matrix.M(out,resid);
      resid = resid-in;
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

      std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }
  };
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;

  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
@@ -76,12 +179,10 @@ namespace Grid {
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0) : _HermitianRBSolver(HermitianRBSolver)
    {
      CBfactorise=cb;
    };

    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){

@@ -141,5 +242,166 @@ namespace Grid {
    }
  };


  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagTwoSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
  public:

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver) :
      _HermitianRBSolver(HermitianRBSolver)
    {
      CBfactorise=0;
    };

    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);

      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);

      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);

      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);

      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      _HermitianRBSolver(_HermOpEO,src_o,tmp);    assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);     assert(  sol_o.checkerboard ==Odd);

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);

      setCheckerboard(out,sol_e);      assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o);      assert(  sol_o.checkerboard ==Odd );

      // Verify the unprec residual
      _Matrix.M(out,resid);
      resid = resid-in;
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

      std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagTwoMixed {
  private:
    LinearFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
  public:

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver) :
      _HermitianRBSolver(HermitianRBSolver)
    {
      CBfactorise=0;
    };

    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);

      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);

      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);

      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);

      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      _HermitianRBSolver(src_o,tmp);   assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);     assert(  sol_o.checkerboard ==Odd);

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);

      setCheckerboard(out,sol_e);      assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o);      assert(  sol_o.checkerboard ==Odd );

      // Verify the unprec residual
      _Matrix.M(out,resid);
      resid = resid-in;
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

      std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }
  };

}
#endif
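For reference, the even/odd elimination that the comment block at the top of this file sketches, written out in standard notation. This is the textbook Schur decomposition, not something introduced by this change, and the precise Mee/Moo factoring differs between the DiagMooee, DiagTwo and Staggered operator classes above.

M \begin{pmatrix}\psi_e\\ \psi_o\end{pmatrix}
 = \begin{pmatrix} M_{ee} & M_{eo}\\ M_{oe} & M_{oo}\end{pmatrix}
   \begin{pmatrix}\psi_e\\ \psi_o\end{pmatrix}
 = \begin{pmatrix}\eta_e\\ \eta_o\end{pmatrix},
\qquad
D_{oo} \equiv M_{oo} - M_{oe}\,M_{ee}^{-1} M_{eo}.

Eliminating the even checkerboard gives

D_{oo}\,\psi_o = \eta_o' \equiv \eta_o - M_{oe} M_{ee}^{-1}\eta_e,
\qquad
\psi_e = M_{ee}^{-1}\bigl(\eta_e - M_{eo}\,\psi_o\bigr).

The Wilson-type solvers above hand the normal equations D_oo^dag D_oo psi_o = D_oo^dag eta_o' to the inner Krylov solver, while the staggered variant solves D_oo psi_o = eta_o' directly, exactly as stated in the header comment.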
@@ -44,13 +44,20 @@ namespace Grid{
class GridBase : public CartesianCommunicator , public GridThread {

public:
   int dummy;
   // Give Lattice access
   template<class object> friend class Lattice;

   GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
   GridBase(const std::vector<int> & processor_grid,
            const CartesianCommunicator &parent,
            int &split_rank)
     : CartesianCommunicator(processor_grid,parent,split_rank) {};
   GridBase(const std::vector<int> & processor_grid,
            const CartesianCommunicator &parent)
     : CartesianCommunicator(processor_grid,parent,dummy) {};

   virtual ~GridBase() = default;

   // Physics Grid information.
   std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
@@ -38,7 +38,7 @@ namespace Grid{
class GridCartesian: public GridBase {

public:
    int dummy;
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      return 0;
    }
@@ -67,7 +67,14 @@ public:
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid,
                  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
    {
      Init(dimensions,simd_layout,processor_grid);
    }
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid,
                  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
    {
      Init(dimensions,simd_layout,processor_grid);
    }
@@ -81,6 +88,8 @@ public:
      Init(dimensions,simd_layout,processor_grid);
    }

    virtual ~GridCartesian() = default;

    void Init(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid)
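A sketch of how the new split-grid constructor above might be used: carve the parent processor mesh into smaller independent sub-grids and record which sub-grid this rank landed in. The lattice and processor dimensions are invented; only the constructor signature is taken from the change above.

// Hypothetical split-grid construction (dimensions invented for illustration).
std::vector<int> latt    = {16,16,16,16};
std::vector<int> simd    = GridDefaultSimd(4,vComplex::Nsimd());
std::vector<int> pparent = {2,2,2,2};    // parent processor grid (16 ranks)
std::vector<int> pchild  = {1,1,2,2};    // each sub-grid uses 4 ranks

GridCartesian Parent(latt,simd,pparent);
int split_rank;
GridCartesian Child (latt,simd,pchild,Parent,split_rank);
// split_rank now labels which of the four sub-grids this rank belongs to.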
@@ -133,6 +133,8 @@ public:
    {
      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ;
    }

    virtual ~GridRedBlackCartesian() = default;
#if 0
    ////////////////////////////////////////////////////////////
    // Create redblack grid ;; deprecate these. Should not
@@ -205,6 +207,7 @@ public:
        {
          assert((_gdimensions[d] & 0x1) == 0);
          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
          _gsites /= 2;
        }
        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
@@ -97,9 +97,9 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
}


#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
  _ndimension = processors.size();
  assert(_ndimension = parent._ndimension);
@@ -117,13 +117,24 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);

  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // coor of split within parent

  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent._processor_coor[d] % processors[d];
    scoor[d] = parent._processor_coor[d] / processors[d];
    ssize[d] = parent._processors[d]     / processors[d];
  }
  int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  // Mpi uses the reverse Lexico convention to us
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);

  MPI_Comm comm_split;
  if ( Nchild > 1 ) {

    /*
    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
    std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
    for(int d=0;d<parent._processors.size();d++)  std::cout << parent._processors[d] << " ";
@@ -133,16 +144,31 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
    for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" new coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
    std::cout<<std::endl;
    */

    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Declare victory
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    /*
    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
              << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
    */
  } else {
    comm_split=parent.communicator;
    srank = 0;
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,9 +181,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  //////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
  _ndimension = processors.size();
  _processor_coor.resize(_ndimension);

@@ -171,10 +194,20 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
  }

  std::vector<int> periodic(_ndimension,1);
  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  if ( communicator_base != communicator_world ) {
    std::cout << "Cartesian communicator created with a non-world communicator"<<std::endl;

    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
    for(int d=0;d<_processors.size();d++){
      std::cout << _processor_coor[d]<<" ";
    }
    std::cout << std::endl;
  }

  int Size;
  MPI_Comm_size(communicator,&Size);
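The ccoor/scoor/ssize arithmetic above is easiest to see with plain integers. A standalone example (no MPI, values invented): a 4x4 parent processor mesh split into 2x2 sub-grids.

#include <cstdio>
#include <vector>

// Standalone illustration of the coordinate split used in the constructor above:
//   ccoor = pcoor % child   -> this rank's coordinate inside its sub-communicator
//   scoor = pcoor / child   -> which sub-communicator block it belongs to
//   ssize = parent / child  -> the grid formed by the blocks themselves
int main(void) {
  std::vector<int> parent = {4,4};   // parent processor grid
  std::vector<int> child  = {2,2};   // requested sub-grid
  std::vector<int> pcoor  = {3,1};   // this rank's coordinate in the parent grid (invented)

  std::vector<int> ccoor(2), scoor(2), ssize(2);
  for(int d=0; d<2; d++){
    ccoor[d] = pcoor[d] % child[d];  // -> {1,1}
    scoor[d] = pcoor[d] / child[d];  // -> {1,0}
    ssize[d] = parent[d] / child[d]; // -> {2,2}
  }
  printf("child coor (%d,%d), block (%d,%d) of a %dx%d block grid\n",
         ccoor[0],ccoor[1], scoor[0],scoor[1], ssize[0],ssize[1]);
  return 0;
}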
@ -153,8 +153,9 @@ class CartesianCommunicator {
|
|||||||
// Constructors to sub-divide a parent communicator
|
// Constructors to sub-divide a parent communicator
|
||||||
// and default to comm world
|
// and default to comm world
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent);
|
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
|
||||||
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
||||||
|
virtual ~CartesianCommunicator();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
|
||||||
@ -263,6 +264,27 @@ class CartesianCommunicator {
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
void Broadcast(int root,void* data, int bytes);
|
void Broadcast(int root,void* data, int bytes);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// All2All down one dimension
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
|
||||||
|
assert(dim>=0);
|
||||||
|
assert(dim<_ndimension);
|
||||||
|
int numnode = _processors[dim];
|
||||||
|
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
|
||||||
|
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
|
||||||
|
assert(in.size()==out.size());
|
||||||
|
uint64_t bytes=sizeof(T);
|
||||||
|
uint64_t words=in.size()/numnode;
|
||||||
|
|
||||||
|
assert(numnode * words == in.size());
|
||||||
|
assert(words < (1ULL<<32));
|
||||||
|
|
||||||
|
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
|
||||||
|
}
|
||||||
|
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
|
||||||
|
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
|
||||||
|
|
||||||
template<class obj> void Broadcast(int root,obj &data)
|
template<class obj> void Broadcast(int root,obj &data)
|
||||||
{
|
{
|
||||||
Broadcast(root,(void *)&data,sizeof(data));
|
Broadcast(root,(void *)&data,sizeof(data));
|
||||||
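Note: the AllToAll<T> template added to CartesianCommunicator in the hunk above exchanges equal-sized slabs of a std::vector along one processor dimension. A minimal usage sketch, not part of the diff; `Comm`, the element type and the sizes are illustrative assumptions, and the vector length must be divisible by the rank count in that dimension:

    // Hedged sketch: exercising the new CartesianCommunicator::AllToAll<T>(dim,in,out).
    // Assumes 'Comm' is a CartesianCommunicator whose _processors[0]==4.
    std::vector<double> in(4*16), out(4*16);       // four 16-word slabs, one per rank in dim 0
    for(size_t i=0;i<in.size();i++) in[i] = 1.0*i; // recognisable fill
    Comm.AllToAll(0,in,out);                       // slab v sent by rank r arrives as slab r on rank v
    // in.size()==out.size() and in.size() % _processors[dim] == 0 are asserted inside the template.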
@@ -52,6 +52,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
   ShmInitGeneric();
 }

+CartesianCommunicator::~CartesianCommunicator()
+{
+  int MPI_is_finalised;
+  MPI_Finalized(&MPI_is_finalised);
+  if (communicator && MPI_is_finalised)
+    MPI_Comm_free(&communicator);
+}

 void CartesianCommunicator::GlobalSum(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);

@@ -187,6 +196,36 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
                        root,
                        communicator);
   assert(ierr==0);
+}
+void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  std::vector<int> row(_ndimension,1);
+  assert(dim>=0 && dim<_ndimension);
+
+  //  Split the communicator
+  row[dim] = _processors[dim];
+
+  int me;
+  CartesianCommunicator Comm(row,*this,me);
+  Comm.AllToAll(in,out,words,bytes);
+}
+void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  // MPI is a pain and uses "int" arguments
+  // 64*64*64*128*16 == 500Million elements of data.
+  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
+  // (Turns up on 32^3 x 64 Gparity too)
+  MPI_Datatype object;
+  int iwords;
+  int ibytes;
+  iwords = words;
+  ibytes = bytes;
+  assert(words == iwords); // safe to cast to int ?
+  assert(bytes == ibytes); // safe to cast to int ?
+  MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
+  MPI_Type_commit(&object);
+  MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+  MPI_Type_free(&object);
 }
 ///////////////////////////////////////////////////////
 // Should only be used prior to Grid Init finished.
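Note: the comments in the new AllToAll(void*,void*,words,bytes) above explain the use of a contiguous MPI datatype: MPI_Alltoall takes `int` counts, and a realistic lattice overflows 2^31 when counted in bytes. A rough check of the numbers quoted in the comment, as a sketch rather than something taken from the diff (the 24*4-byte word size is the illustrative figure from the comment):

    // Hedged sketch: why the count is expressed in 'words' of one MPI datatype, not bytes.
    uint64_t words          = 64ull*64*64*128*16;   // ~5.4e8, still fits in an int
    uint64_t bytes_per_word = 24*4;                 // figure quoted in the comment above
    uint64_t total_bytes    = words*bytes_per_word; // ~5.2e10 >> 2^31, would overflow an int count
    assert(total_bytes > (1ull<<31));
    // MPI_Type_contiguous(bytes_per_word,MPI_BYTE,&word_t) lets MPI_Alltoall count 'words' instead.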
@@ -207,5 +246,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
   assert(ierr==0);
 }
+
+
 }

@@ -712,7 +712,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                         int from,
                         int bytes,int dir)
 {
-  assert(dir < communicator_halo.size());
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;

   MPI_Request xrq;
   MPI_Request rrq;

@@ -732,14 +733,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     gfrom = MPI_UNDEFINED;
 #endif
   if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
     assert(ierr==0);
     list.push_back(rrq);
     off_node_bytes+=bytes;
   }

   if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
     assert(ierr==0);
     list.push_back(xrq);
     off_node_bytes+=bytes;

@@ -53,6 +53,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   ShmInitGeneric();
 }

+CartesianCommunicator::~CartesianCommunicator()
+{
+  if (communicator && !MPI::Is_finalized())
+    MPI_Comm_free(&communicator);
+}
+
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);

@@ -217,13 +224,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 {
   int myrank = _processor;
   int ierr;
-  assert(dir < communicator_halo.size());
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;

   //    std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);

   list.push_back(req[0]);
   list.push_back(req[1]);

@@ -242,13 +250,14 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
 {
   int myrank = _processor;
   int ierr;
-  assert(dir < communicator_halo.size());
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;

-  //    std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);
   MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
   return 2.0*bytes;
 }
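Note: several hunks above replace the hard assert on `dir < communicator_halo.size()` with `commdir = dir % ncomm`, so a build configured with fewer halo communicators than stencil directions wraps the direction index onto the available pool instead of aborting. A one-line illustration of the mapping, with made-up counts:

    // Hedged illustration of the dir -> commdir mapping introduced above.
    int ncomm   = communicator_halo.size();   // say 4 halo communicators were created
    int commdir = dir % ncomm;                // stencil direction 6 then reuses communicator 2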
@@ -38,8 +38,8 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
   ShmInitGeneric();
 }

-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
-  : CartesianCommunicator(processors) {}
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
+  : CartesianCommunicator(processors) { srank=0;}

 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {

@@ -56,6 +56,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   }
 }

+CartesianCommunicator::~CartesianCommunicator(){}
+
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}

@@ -98,6 +100,14 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
 {
   assert(0);
 }
+void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  bcopy(in,out,bytes*words);
+}
+void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  bcopy(in,out,bytes*words);
+}

 int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
@@ -109,8 +109,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,

   coarseData=zero;

-  // Loop with a cache friendly loop ordering
-  for(int sf=0;sf<fine->oSites();sf++){
+  // Loop over coars parallel, and then loop over fine associated with coarse.
+  parallel_for(int sf=0;sf<fine->oSites();sf++){

     int sc;
     std::vector<int> coor_c(_ndimension);

@@ -119,6 +119,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
     for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
     Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

+PARALLEL_CRITICAL
     for(int i=0;i<nbasis;i++) {

       coarseData._odata[sc](i)=coarseData._odata[sc](i)

@@ -139,6 +140,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
   GridBase * coarse= coarseA._grid;

   fineZ.checkerboard=fineX.checkerboard;
+  assert(fineX.checkerboard==fineY.checkerboard);
   subdivides(coarse,fine); // require they map
   conformable(fineX,fineY);
   conformable(fineX,fineZ);

@@ -180,9 +182,10 @@ template<class vobj,class CComplex>
   GridBase *coarse(CoarseInner._grid);
   GridBase *fine  (fineX._grid);

-  Lattice<dotp> fine_inner(fine);
+  Lattice<dotp> fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard;
   Lattice<dotp> coarse_inner(coarse);

+  // Precision promotion?
   fine_inner = localInnerProduct(fineX,fineY);
   blockSum(coarse_inner,fine_inner);
   parallel_for(int ss=0;ss<coarse->oSites();ss++){

@@ -193,7 +196,7 @@ template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
   GridBase *coarse = ip._grid;
-  Lattice<vobj> zz(fineX._grid); zz=zero;
+  Lattice<vobj> zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard;
   blockInnerProduct(ip,fineX,fineX);
   ip = pow(ip,-0.5);
   blockZAXPY(fineX,ip,fineX,zz);

@@ -216,19 +219,25 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
   }

+  // Turn this around to loop threaded over sc and interior loop
+  // over sf would thread better
   coarseData=zero;
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_region {

     int sc;
     std::vector<int> coor_c(_ndimension);
     std::vector<int> coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
+    parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+PARALLEL_CRITICAL
+      coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
+    }
   }
   return;
 }
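Note: the new comment in blockSum above ("Turn this around to loop threaded over sc ...") points at an alternative ordering that would not need the PARALLEL_CRITICAL section. A hedged sketch of that ordering; `blockSites(sc)`, standing in for an enumeration of the fine sites belonging to coarse block sc, is hypothetical and not an existing Grid helper:

    // Sketch only: thread over the coarse index, accumulate its block serially.
    parallel_for(int sc=0;sc<coarse->oSites();sc++){
      coarseData._odata[sc]=zero;
      for(auto sf : blockSites(sc)){          // hypothetical enumeration of block sc
        coarseData._odata[sc] = coarseData._odata[sc] + fineData._odata[sf];
      }
    }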
@@ -238,7 +247,7 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
 {
   GridBase * fine = unpicked._grid;

-  Lattice<vobj> zz(fine);
+  Lattice<vobj> zz(fine); zz.checkerboard = unpicked.checkerboard;
   Lattice<iScalar<vInteger> > fcoor(fine);

   zz = zero;

@@ -303,20 +312,21 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   }

   // Loop with a cache friendly loop ordering
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_region {

     int sc;
     std::vector<int> coor_c(_ndimension);
     std::vector<int> coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-    for(int i=0;i<nbasis;i++) {
-      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) *  Basis[i]._odata[sf];
-      else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
+    parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+
+      for(int i=0;i<nbasis;i++) {
+        if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) *  Basis[i]._odata[sf];
+        else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
+      }
     }
   }
   return;

@@ -685,5 +695,314 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
   }
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// Communicate between grids
+////////////////////////////////////////////////////////////////////////////////
+//
+// All to all plan
+//
+// Subvolume on fine grid is v. Vectors a,b,c,d
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMPLEST CASE:
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Mesh of nodes (2) ; subdivide to 1 subdivisions
+//
+// Lex ord:
+//          N0 va0 vb0  N1 va1 vb1
+//
+// For each dimension do an all to all
+//
+// full AllToAll(0)
+//          N0 va0 va1    N1 vb0 vb1
+//
+// REARRANGE
+//          N0 va01       N1 vb01
+//
+// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
+// NB: Easiest to programme if keep in lex order.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMPLE CASE:
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Mesh of nodes (2x2) ; subdivide to 1x1 subdivisions
+//
+// Lex ord:
+//          N0 va0 vb0 vc0 vd0   N1 va1 vb1 vc1 vd1
+//          N2 va2 vb2 vc2 vd2   N3 va3 vb3 vc3 vd3
+//
+// Ratio = full[dim] / split[dim]
+//
+// For each dimension do an all to all; get Nvec -> Nvec / ratio
+//                                          Ldim -> Ldim * ratio
+//                                          LocalVol -> LocalVol * ratio
+// full AllToAll(0)
+//          N0 va0 vb0 va1 vb1   N1 vc0 vd0 vc1 vd1
+//          N2 va2 vb2 va3 vb3   N3 vc2 vd2 vc3 vd3
+//
+// REARRANGE
+//          N0 va01 vb01      N1 vc01 vd01
+//          N2 va23 vb23      N3 vc23 vd23
+//
+// full AllToAll(1)       // Not what is wanted. FIXME
+//          N0 va01 va23    N1 vc01 vc23
+//          N2 vb01 vb23    N3 vd01 vd23
+//
+// REARRANGE
+//          N0 va0123     N1 vc0123
+//          N2 vb0123     N3 vd0123
+//
+// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
+// NB: Easiest to programme if keep in lex order.
+//
+/////////////////////////////////////////////////////////
+
+template<class Vobj>
+void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
+{
+  typedef typename Vobj::scalar_object Sobj;
+
+  int full_vecs   = full.size();
+
+  assert(full_vecs>=1);
+
+  GridBase * full_grid = full[0]._grid;
+  GridBase *split_grid = split._grid;
+
+  int       ndim  = full_grid->_ndimension;
+  int  full_nproc = full_grid->_Nprocessors;
+  int split_nproc =split_grid->_Nprocessors;
+
+  ////////////////////////////////
+  // Checkerboard management
+  ////////////////////////////////
+  int cb = full[0].checkerboard;
+  split.checkerboard = cb;
+
+  //////////////////////////////
+  // Checks
+  //////////////////////////////
+  assert(full_grid->_ndimension==split_grid->_ndimension);
+  for(int n=0;n<full_vecs;n++){
+    assert(full[n].checkerboard == cb);
+    for(int d=0;d<ndim;d++){
+      assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
+      assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
+    }
+  }
+
+  int   nvector   =full_nproc/split_nproc;
+  assert(nvector*split_nproc==full_nproc);
+  assert(nvector == full_vecs);
+
+  std::vector<int> ratio(ndim);
+  for(int d=0;d<ndim;d++){
+    ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
+  }
+
+  uint64_t lsites = full_grid->lSites();
+  uint64_t sz = lsites * nvector;
+  std::vector<Sobj> tmpdata(sz);
+  std::vector<Sobj> alldata(sz);
+  std::vector<Sobj> scalardata(lsites);
+
+  for(int v=0;v<nvector;v++){
+    unvectorizeToLexOrdArray(scalardata,full[v]);
+    parallel_for(int site=0;site<lsites;site++){
+      alldata[v*lsites+site] = scalardata[site];
+    }
+  }
+
+  int nvec = nvector; // Counts down to 1 as we collapse dims
+  std::vector<int> ldims = full_grid->_ldimensions;
+  std::vector<int> lcoor(ndim);
+
+  for(int d=ndim-1;d>=0;d--){
+
+    if ( ratio[d] != 1 ) {
+
+      full_grid ->AllToAll(d,alldata,tmpdata);
+      //    std::cout << GridLogMessage << "Grid_split: dim " <<d<<" ratio "<<ratio[d]<<" nvec "<<nvec<<" procs "<<split_grid->_processors[d]<<std::endl;
+      //    for(int v=0;v<nvec;v++){
+      //      std::cout << "Grid_split: alldata["<<v<<"] " << alldata[v] <<std::endl;
+      //      std::cout << "Grid_split: tmpdata["<<v<<"] " << tmpdata[v] <<std::endl;
+      //    }
+      //////////////////////////////////////////
+      //Local volume for this dimension is expanded by ratio of processor extents
+      // Number of vectors is decreased by same factor
+      // Rearrange to lexico for bigger volume
+      //////////////////////////////////////////
+      nvec    /= ratio[d];
+
+      auto rdims = ldims; rdims[d]  *= ratio[d];
+      auto rsites= lsites*ratio[d];
+      for(int v=0;v<nvec;v++){
+
+        // For loop over each site within old subvol
+        for(int lsite=0;lsite<lsites;lsite++){
+
+          Lexicographic::CoorFromIndex(lcoor, lsite, ldims);
+
+          for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
+
+            auto rcoor = lcoor; rcoor[d] += r*ldims[d];
+
+            int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);
+            rsite += v * rsites;
+
+            int rmul=nvec*lsites;
+            int vmul=     lsites;
+            alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
+            //      if ( lsite==0 ) {
+            //        std::cout << "Grid_split: grow alldata["<<rsite<<"] " << alldata[rsite] << " <- tmpdata["<< lsite+r*rmul+v*vmul<<"] "<<tmpdata[lsite+r*rmul+v*vmul] <<std::endl;
+            //      }
+          }
+        }
+      }
+      ldims[d]*= ratio[d];
+      lsites  *= ratio[d];
+
+      if ( split_grid->_processors[d] > 1 ) {
+        tmpdata = alldata;
+        split_grid->AllToAll(d,tmpdata,alldata);
+      }
+    }
+  }
+  vectorizeFromLexOrdArray(alldata,split);
+}
+
+template<class Vobj>
+void Grid_split(Lattice<Vobj> &full,Lattice<Vobj>   & split)
+{
+  int nvector = full._grid->_Nprocessors / split._grid->_Nprocessors;
+  std::vector<Lattice<Vobj> > full_v(nvector,full._grid);
+  for(int n=0;n<nvector;n++){
+    full_v[n] = full;
+  }
+  Grid_split(full_v,split);
+}
+
+template<class Vobj>
+void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
+{
+  typedef typename Vobj::scalar_object Sobj;
+
+  int full_vecs   = full.size();
+
+  assert(full_vecs>=1);
+
+  GridBase * full_grid = full[0]._grid;
+  GridBase *split_grid = split._grid;
+
+  int       ndim  = full_grid->_ndimension;
+  int  full_nproc = full_grid->_Nprocessors;
+  int split_nproc =split_grid->_Nprocessors;
+
+  ////////////////////////////////
+  // Checkerboard management
+  ////////////////////////////////
+  int cb = full[0].checkerboard;
+  split.checkerboard = cb;
+
+  //////////////////////////////
+  // Checks
+  //////////////////////////////
+  assert(full_grid->_ndimension==split_grid->_ndimension);
+  for(int n=0;n<full_vecs;n++){
+    assert(full[n].checkerboard == cb);
+    for(int d=0;d<ndim;d++){
+      assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
+      assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
+    }
+  }
+
+  int   nvector   =full_nproc/split_nproc;
+  assert(nvector*split_nproc==full_nproc);
+  assert(nvector == full_vecs);
+
+  std::vector<int> ratio(ndim);
+  for(int d=0;d<ndim;d++){
+    ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
+  }
+
+  uint64_t lsites = full_grid->lSites();
+  uint64_t sz = lsites * nvector;
+  std::vector<Sobj> tmpdata(sz);
+  std::vector<Sobj> alldata(sz);
+  std::vector<Sobj> scalardata(lsites);
+
+  unvectorizeToLexOrdArray(alldata,split);
+
+  /////////////////////////////////////////////////////////////////
+  // Start from split grid and work towards full grid
+  /////////////////////////////////////////////////////////////////
+  std::vector<int> lcoor(ndim);
+  std::vector<int> rcoor(ndim);
+
+  int nvec = 1;
+  lsites = split_grid->lSites();
+  std::vector<int> ldims = split_grid->_ldimensions;
+
+  //  for(int d=ndim-1;d>=0;d--){
+  for(int d=0;d<ndim;d++){
+
+    if ( ratio[d] != 1 ) {
+
+      if ( split_grid->_processors[d] > 1 ) {
+        tmpdata = alldata;
+        split_grid->AllToAll(d,tmpdata,alldata);
+      }
+
+      //////////////////////////////////////////
+      //Local volume for this dimension is expanded by ratio of processor extents
+      // Number of vectors is decreased by same factor
+      // Rearrange to lexico for bigger volume
+      //////////////////////////////////////////
+      auto rsites= lsites/ratio[d];
+      auto rdims = ldims; rdims[d]/=ratio[d];
+
+      for(int v=0;v<nvec;v++){
+
+        // rsite, rcoor --> smaller local volume
+        // lsite, lcoor --> bigger original (single node?) volume
+        // For loop over each site within smaller subvol
+        for(int rsite=0;rsite<rsites;rsite++){
+
+          Lexicographic::CoorFromIndex(rcoor, rsite, rdims);
+          int lsite;
+
+          for(int r=0;r<ratio[d];r++){
+
+            lcoor = rcoor; lcoor[d] += r*rdims[d];
+            Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;
+
+            int rmul=nvec*rsites;
+            int vmul=     rsites;
+            tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];
+          }
+        }
+      }
+      nvec   *= ratio[d];
+      ldims[d]=rdims[d];
+      lsites  =rsites;
+
+      full_grid ->AllToAll(d,tmpdata,alldata);
+    }
+  }
+
+  lsites = full_grid->lSites();
+  for(int v=0;v<nvector;v++){
+    assert(v<full.size());
+    parallel_for(int site=0;site<lsites;site++){
+      scalardata[site] = alldata[v*lsites+site];
+    }
+    vectorizeFromLexOrdArray(scalardata,full[v]);
+  }
+}
 }
 #endif
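Note: the Grid_split / Grid_unsplit templates added above implement the "all to all plan" laid out in the comment block of the same hunk: nvector = full_nproc/split_nproc lattices living on the full communicator are repacked onto a split communicator with a proportionally larger local volume, and back again. A hedged usage sketch; the lattice/processor dimensions are assumptions, and in practice split_grid must be built over a sub-communicator of full_grid (the new CartesianCommunicator(processors,parent,srank) constructor above provides this), plumbing which is elided here:

    // Sketch only: pack 8 full-grid lattices into one split-grid lattice and recover them.
    std::vector<int> latt({16,16,16,16});
    std::vector<int> simd = GridDefaultSimd(4,vComplex::Nsimd());
    std::vector<int> pfull({1,1,2,4}), psplit({1,1,1,1});   // 8 ranks vs 1 rank per sub-grid
    GridCartesian full_grid (latt,simd,pfull);
    GridCartesian split_grid(latt,simd,psplit);             // same global volume, fewer ranks

    int nvector = full_grid._Nprocessors / split_grid._Nprocessors;   // == 8 in this example
    std::vector<LatticeComplex> full(nvector,&full_grid);
    LatticeComplex split(&split_grid);

    Grid_split  (full,split);   // scatter the 8 vectors across the sub-communicators
    Grid_unsplit(full,split);   // gather them back; each full[v] should be recovered exactly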
@@ -50,7 +50,7 @@ namespace Grid {
   return (status==0) ? res.get() : name ;
 }

-GridStopWatch Logger::StopWatch;
+GridStopWatch Logger::GlobalStopWatch;
 int Logger::timestamp;
 std::ostream Logger::devnull(0);

@@ -59,13 +59,15 @@ void GridLogTimestamp(int on){
 }

 Colours GridLogColours(0);
-GridLogger GridLogError(1, "Error", GridLogColours, "RED");
+GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
+GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
+GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
-GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
+GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
-GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
-GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
+GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");

 void GridLogConfigure(std::vector<std::string> &logstreams) {
   GridLogError.Active(0);

@@ -85,12 +85,15 @@ class Logger {
 protected:
   Colours &Painter;
   int active;
+  int timing_mode;
   static int timestamp;
   std::string name, topName;
   std::string COLOUR;

 public:
-  static GridStopWatch StopWatch;
+  static GridStopWatch GlobalStopWatch;
+  GridStopWatch LocalStopWatch;
+  GridStopWatch *StopWatch;
   static std::ostream devnull;

   std::string background() {return Painter.colour["NORMAL"];}

@@ -101,22 +104,38 @@ public:
     name(nm),
     topName(topNm),
     Painter(col_class),
-    COLOUR(col) {} ;
+    timing_mode(0),
+    COLOUR(col)
+  {
+    StopWatch = & GlobalStopWatch;
+  };

   void Active(int on) {active = on;};
   int  isActive(void) {return active;};
   static void Timestamp(int on) {timestamp = on;};
+  void Reset(void) {
+    StopWatch->Reset();
+    StopWatch->Start();
+  }
+  void TimingMode(int on) {
+    timing_mode = on;
+    if(on) {
+      StopWatch = &LocalStopWatch;
+      Reset();
+    }
+  }

   friend std::ostream& operator<< (std::ostream& stream, Logger& log){

     if ( log.active ) {
-      stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
-      stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
+      stream << log.background()<<  std::left << log.topName << log.background()<< " : ";
+      stream << log.colour() <<  std::left << log.name << log.background() << " : ";
       if ( log.timestamp ) {
-        StopWatch.Stop();
-        GridTime now = StopWatch.Elapsed();
-        StopWatch.Start();
-        stream << log.evidence()<< now << log.background() << " : " ;
+        log.StopWatch->Stop();
+        GridTime now = log.StopWatch->Elapsed();
+        if ( log.timing_mode==1 ) log.StopWatch->Reset();
+        log.StopWatch->Start();
+        stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
       }
       stream << log.colour();
       return stream;
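Note: the Logger changes above give every logger a per-instance LocalStopWatch in addition to the shared GlobalStopWatch, selected by TimingMode(). A hedged usage sketch of the new controls, using the GridLogIterative stream as an example:

    // Sketch only: per-stream timing with the new TimingMode/Reset controls.
    GridLogIterative.Active(1);
    GridLogIterative.Timestamp(1);
    GridLogIterative.TimingMode(1);   // switch from GlobalStopWatch to the per-logger watch
    GridLogIterative.Reset();         // zero and restart the local stopwatch
    std::cout << GridLogIterative << "time shown is since the previous message, not programme start" << std::endl;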
@@ -135,6 +154,8 @@ public:

 void GridLogConfigure(std::vector<std::string> &logstreams);

+extern GridLogger GridLogIRL;
+extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;

@@ -261,7 +261,7 @@ class BinaryIO {
                              GridBase *grid,
                              std::vector<fobj> &iodata,
                              std::string file,
-                             int offset,
+                             Integer offset,
                              const std::string &format, int control,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,

@@ -356,7 +356,7 @@ class BinaryIO {

     if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-      std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
+      std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
       ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
       ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
       ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);

@@ -367,7 +367,7 @@ class BinaryIO {
       assert(0);
 #endif
     } else {
-      std::cout << GridLogMessage << "C++ read I/O " << file << " : "
+      std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
                 << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
       std::ifstream fin;
       fin.open(file, std::ios::binary | std::ios::in);

@@ -413,9 +413,9 @@ class BinaryIO {
     timer.Start();
     if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-      std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
+      std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
       ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
-      std::cout << GridLogMessage << "Checking for errors" << std::endl;
+      //      std::cout << GridLogMessage << "Checking for errors" << std::endl;
       if (ierr != MPI_SUCCESS)
       {
         char error_string[BUFSIZ];

@@ -445,46 +445,54 @@ class BinaryIO {
 #endif
     } else {

-      std::ofstream fout;
-      fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
-      try {
-        fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-      } catch (const std::fstream::failure& exc) {
-        std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
-        std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
-        std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
-#ifdef USE_MPI_IO
-        MPI_Abort(MPI_COMM_WORLD,1);
-#else
-        exit(1);
-#endif
-      }
-      std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
-               << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-      if ( control & BINARYIO_MASTER_APPEND )  {
-        fout.seekp(0,fout.end);
-      } else {
-        fout.seekp(offset+myrank*lsites*sizeof(fobj));
-      }
-      try {
-        fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
-      }
-      catch (const std::fstream::failure& exc) {
-        std::cout << "Exception in writing file " << file << std::endl;
-        std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
-#ifdef USE_MPI_IO
-        MPI_Abort(MPI_COMM_WORLD,1);
-#else
-        exit(1);
-#endif
-      }
+      std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
+                << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+
+      std::ofstream fout;
+      fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
+      try {
+        fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+      } catch (const std::fstream::failure& exc) {
+        std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
+        std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
+        std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+#ifdef USE_MPI_IO
+        MPI_Abort(MPI_COMM_WORLD,1);
+#else
+        exit(1);
+#endif
+      }
+
+      if ( control & BINARYIO_MASTER_APPEND )  {
+        try {
+          fout.seekp(0,fout.end);
+        } catch (const std::fstream::failure& exc) {
+          std::cout << "Exception in seeking file end " << file << std::endl;
+        }
+      } else {
+        try {
+          fout.seekp(offset+myrank*lsites*sizeof(fobj));
+        } catch (const std::fstream::failure& exc) {
+          std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
+        }
+      }
+
+      try {
+        fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
+      }
+      catch (const std::fstream::failure& exc) {
+        std::cout << "Exception in writing file " << file << std::endl;
+        std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
+#ifdef USE_MPI_IO
+        MPI_Abort(MPI_COMM_WORLD,1);
+#else
+        exit(1);
+#endif
+      }
       fout.close();
     }
     timer.Stop();
   }

   std::cout<<GridLogMessage<<"IOobject: ";
   if ( control & BINARYIO_READ) std::cout << " read  ";

@@ -515,7 +523,7 @@ class BinaryIO {
   static inline void readLatticeObject(Lattice<vobj> &Umu,
                                        std::string file,
                                        munger munge,
-                                       int offset,
+                                       Integer offset,
                                        const std::string &format,
                                        uint32_t &nersc_csum,
                                        uint32_t &scidac_csuma,

@@ -552,7 +560,7 @@ class BinaryIO {
   static inline void writeLatticeObject(Lattice<vobj> &Umu,
                                         std::string file,
                                         munger munge,
-                                        int offset,
+                                        Integer offset,
                                         const std::string &format,
                                         uint32_t &nersc_csum,
                                         uint32_t &scidac_csuma,

@@ -589,7 +597,7 @@ class BinaryIO {
   static inline void readRNG(GridSerialRNG &serial,
                              GridParallelRNG &parallel,
                              std::string file,
-                             int offset,
+                             Integer offset,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,
                              uint32_t &scidac_csumb)

@@ -651,7 +659,7 @@ class BinaryIO {
   static inline void writeRNG(GridSerialRNG &serial,
                               GridParallelRNG &parallel,
                               std::string file,
-                              int offset,
+                              Integer offset,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,
                              uint32_t &scidac_csumb)

@@ -147,7 +147,7 @@ namespace QCD {

   _scidacRecord = sr;

-  std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
+  //  std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
 }

 ///////////////////////////////////////////////////////

@@ -159,7 +159,7 @@ namespace QCD {
   uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
   if ( scidac_csuma !=scidac_checksuma) return 0;
   if ( scidac_csumb !=scidac_checksumb) return 0;
   return 1;
 }

 ////////////////////////////////////////////////////////////////////////////////////

@@ -224,7 +224,7 @@ class GridLimeReader : public BinaryIO {

     assert(PayloadSize == file_bytes);// Must match or user error

-    off_t offset= ftell(File);
+    uint64_t offset= ftello(File);
     // std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
     BinarySimpleMunger<sobj,sobj> munge;
     BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);

@@ -237,7 +237,7 @@ class GridLimeReader : public BinaryIO {
     /////////////////////////////////////////////
     // Verify checksums
     /////////////////////////////////////////////
-    scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
+    assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
     return;
   }
 }

@@ -253,16 +253,13 @@ class GridLimeReader : public BinaryIO {
   while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {

     // std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
     uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)

     if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {

       // std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
       std::vector<char> xmlc(nbytes+1,'\0');
       limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);

       // std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
       XmlReader RD(&xmlc[0],"");

@@ -332,7 +329,7 @@ class GridLimeWriter : public BinaryIO {
     err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
     err=limeWriterCloseRecord(LimeW);  assert(err>=0);
     limeDestroyHeader(h);
-    // std::cout << " File offset is now"<<ftell(File) << std::endl;
+    // std::cout << " File offset is now"<<ftello(File) << std::endl;
   }
   ////////////////////////////////////////////
   // Write a generic lattice field and csum

@@ -349,7 +346,6 @@ class GridLimeWriter : public BinaryIO {
     uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
     createLimeRecordHeader(record_name, 0, 0, PayloadSize);

     // std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
     // std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
     // std::cout << "W Payload expected " <<PayloadSize<<std::endl;

@@ -361,18 +357,20 @@ class GridLimeWriter : public BinaryIO {
     // These are both buffered, so why I think this code is right is as follows.
     //
     // i)  write record header to FILE *File, telegraphing the size.
-    // ii) ftell reads the offset from FILE *File .
+    // ii) ftello reads the offset from FILE *File .
     // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
     //      Closes iostream and flushes.
     // iv)  fseek on FILE * to end of this disjoint section.
     //  v)  Continue writing scidac record.
     ////////////////////////////////////////////////////////////////////
-    off_t offset = ftell(File);
+    uint64_t offset = ftello(File);
     // std::cout << " Writing to offset "<<offset << std::endl;
     std::string format = getFormatString<vobj>();
     BinarySimpleMunger<sobj,sobj> munge;
     BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+    // fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
     err=limeWriterCloseRecord(LimeW);  assert(err>=0);

     ////////////////////////////////////////
     // Write checksum element, propagaing forward from the BinaryIO
     // Always pair a checksum with a binary object, and close message
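Note: the numbered comment (i)-(v) above describes how the LIME writer interleaves two independent handles on the same file: the buffered FILE* that LIME owns, and a separate iostream/MPI-IO handle that writes the binary payload at the offset read back with ftello. A hedged sketch of that hand-off; `filename`, `payload` and `payload_bytes` are hypothetical placeholders and the LIME calls are reduced to comments:

    // Sketch only: the ftello/seek hand-off of steps (i)-(v), with 'File' the FILE* owned by LIME.
    uint64_t offset = ftello(File);            // (ii) where the record payload must start
    {
      std::ofstream fout(filename, std::ios::binary|std::ios::in|std::ios::out);
      fout.seekp(offset);                      // (iii) independent handle seeks the same offset
      fout.write((char *)payload, payload_bytes);
    }                                          //      closes and flushes before LIME continues
    fseek(File, offset + payload_bytes, SEEK_SET);  // (iv) advance the LIME handle past the payload
    // (v) limeWriterCloseRecord(...) then continues on the FILE* as in the hunk above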
@ -382,7 +380,7 @@ class GridLimeWriter : public BinaryIO {
|
|||||||
std::stringstream streamb; streamb << std::hex << scidac_csumb;
|
std::stringstream streamb; streamb << std::hex << scidac_csumb;
|
||||||
checksum.suma= streama.str();
|
checksum.suma= streama.str();
|
||||||
checksum.sumb= streamb.str();
|
checksum.sumb= streamb.str();
|
||||||
std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
|
// std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
|
||||||
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
|
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -642,7 +640,7 @@ class IldgReader : public GridLimeReader {
|
|||||||
// Copy out the string
|
// Copy out the string
|
||||||
std::vector<char> xmlc(nbytes+1,'\0');
|
std::vector<char> xmlc(nbytes+1,'\0');
|
||||||
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
|
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
|
||||||
std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
|
// std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
// ILDG format record
|
// ILDG format record
|
||||||
@ -686,7 +684,7 @@ class IldgReader : public GridLimeReader {
|
|||||||
std::string xmls(&xmlc[0]);
|
std::string xmls(&xmlc[0]);
|
||||||
// is it a USQCD info field
|
// is it a USQCD info field
|
||||||
if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
|
if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
|
||||||
std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
|
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
|
||||||
XmlReader RD(&xmlc[0],"");
|
XmlReader RD(&xmlc[0],"");
|
||||||
read(RD,"usqcdInfo",usqcdInfo_);
|
read(RD,"usqcdInfo",usqcdInfo_);
|
||||||
found_usqcdInfo = 1;
|
found_usqcdInfo = 1;
|
||||||
@ -704,8 +702,7 @@ class IldgReader : public GridLimeReader {
|
|||||||
// Binary data
|
// Binary data
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
|
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
|
||||||
off_t offset= ftell(File);
|
uint64_t offset= ftello(File);
|
||||||
|
|
||||||
if ( format == std::string("IEEE64BIG") ) {
|
if ( format == std::string("IEEE64BIG") ) {
|
||||||
GaugeSimpleMunger<dobj, sobj> munge;
|
GaugeSimpleMunger<dobj, sobj> munge;
|
||||||
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
|
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
@ -47,8 +47,4 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
|
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
|
||||||
// Laplacian on fermion fields
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
|
||||||
#include <Grid/qcd/utils/CovariantLaplacian.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -53,7 +53,7 @@ directory
|
|||||||
// Utility functions
|
// Utility functions
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#include <Grid/qcd/utils/Metric.h>
|
#include <Grid/qcd/utils/Metric.h>
|
||||||
#include <Grid/qcd/utils/CovariantAdjointLaplacian.h>
|
#include <Grid/qcd/utils/CovariantLaplacian.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@@ -60,11 +60,11 @@ namespace QCD {
  Approx::zolotarev_free(zdata);
  }

- /*
-   Additional EOFA operators only called outside the inverter.
-   Since speed is not essential, simple axpby-style
-   implementations should be fine.
- */
+ /***************************************************************
+  * Additional EOFA operators only called outside the inverter.
+  * Since speed is not essential, simple axpby-style
+  * implementations should be fine.
+  ***************************************************************/
  template<class Impl>
  void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
  {
@@ -115,9 +115,9 @@ namespace QCD {
  return(norm2(chi));
  }

-
- // Performance critical fermion operators called inside the inverter
-
+ /********************************************************************
+  * Performance critical fermion operators called inside the inverter
+  ********************************************************************/

  template<class Impl>
  void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
@@ -77,11 +77,11 @@ namespace QCD {
  }
  }

- /*
-   Additional EOFA operators only called outside the inverter.
-   Since speed is not essential, simple axpby-style
-   implementations should be fine.
- */
+ /****************************************************************
+  * Additional EOFA operators only called outside the inverter.
+  * Since speed is not essential, simple axpby-style
+  * implementations should be fine.
+  ***************************************************************/
  template<class Impl>
  void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
  {
@@ -193,9 +193,9 @@ namespace QCD {
  return(norm2(chi));
  }

-
- // Performance critical fermion operators called inside the inverter
-
+ /********************************************************************
+  * Performance critical fermion operators called inside the inverter
+  ********************************************************************/

  template<class Impl>
  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
@@ -38,7 +38,7 @@ namespace Grid{
  // (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
  //
  // Determinant is det of middle factor
- // NOTICE: This assumes Mee is indept of U in computing the derivative
+ // This assumes Mee is indept of U.
  //
  template<class Impl>
  class SchurDifferentiableOperator : public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField>
@@ -77,7 +77,7 @@ namespace Grid{
  // X^dag Der_oe MeeInv Meo Y
  // Use Mooee as nontrivial but gauge field indept
  this->_Mat.Meooe   (V,tmp1);    // odd->even -- implicit -0.5 factor to be applied
  this->_Mat.MooeeInv(tmp1,tmp2); // even->even
  this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
  // Accumulate X^dag M_oe MeeInv Der_eo Y
  this->_Mat.MeooeDag   (U,tmp1);  // even->odd -- implicit -0.5 factor to be applied
@@ -231,7 +231,7 @@ class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy,
  Field Pfg(U._grid);
  Ufg = U;
  Pfg = zero;
- std::cout << GridLogMessage << "FG update " << fg_dt << " " << ep
+ std::cout << GridLogIntegrator << "FG update " << fg_dt << " " << ep
            << std::endl;
  // prepare_fg; no prediction/result cache for now
  // could relax CG stopping conditions for the
@@ -1,209 +0,0 @@ (deleted file)
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/scalar/CovariantAdjointLaplacian.h

    Copyright (C) 2016

    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    (standard GNU GPL v2 license text)
*************************************************************************************/
/* END LEGAL */

#ifndef COVARIANT_ADJOINT_LAPLACIAN_H
#define COVARIANT_ADJOINT_LAPLACIAN_H

namespace Grid
{
namespace QCD
{
struct LaplacianParams : Serializable
{
  GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams,
                                  RealD, lo,
                                  RealD, hi,
                                  int, MaxIter,
                                  RealD, tolerance,
                                  int, degree,
                                  int, precision);

  // constructor
  LaplacianParams(RealD lo = 0.0,
                  RealD hi = 1.0,
                  int maxit = 1000,
                  RealD tol = 1.0e-8,
                  int degree = 10,
                  int precision = 64)
      : lo(lo),
        hi(hi),
        MaxIter(maxit),
        tolerance(tol),
        degree(degree),
        precision(precision){};
};

////////////////////////////////////////////////////////////
// Laplacian operator L on adjoint fields
//
// phi: adjoint field
// L: D_mu^dag D_mu
//
// L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag +
//                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu)
//                     -2phi(x)]
//
// Operator designed to be encapsulated by
// an HermitianLinearOperator<.. , ..>
////////////////////////////////////////////////////////////

template <class Impl>
class LaplacianAdjointField : public Metric<typename Impl::Field>
{
  OperatorFunction<typename Impl::Field> &Solver;
  LaplacianParams param;
  MultiShiftFunction PowerHalf;
  MultiShiftFunction PowerInvHalf;

 public:
  INHERIT_GIMPL_TYPES(Impl);

  LaplacianAdjointField(GridBase *grid, OperatorFunction<GaugeField> &S, LaplacianParams &p, const RealD k = 1.0)
      : U(Nd, grid), Solver(S), param(p), kappa(k)
  {
    AlgRemez remez(param.lo, param.hi, param.precision);
    std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(1/2)" << std::endl;
    remez.generateApprox(param.degree, 1, 2);
    PowerHalf.Init(remez, param.tolerance, false);
    PowerInvHalf.Init(remez, param.tolerance, true);
  };

  void Mdir(const GaugeField &, GaugeField &, int, int) { assert(0); }
  void Mdiag(const GaugeField &, GaugeField &) { assert(0); }

  void ImportGauge(const GaugeField &_U)
  {
    for (int mu = 0; mu < Nd; mu++)
    {
      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
    }
  }

  void M(const GaugeField &in, GaugeField &out)
  {
    // in is an antihermitian matrix
    // test
    //GaugeField herm = in + adj(in);
    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;
    GaugeLinkField tmp(in._grid);
    GaugeLinkField tmp2(in._grid);
    GaugeLinkField sum(in._grid);
    for (int nu = 0; nu < Nd; nu++)
    {
      sum = zero;
      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
      GaugeLinkField out_nu(out._grid);
      for (int mu = 0; mu < Nd; mu++)
      {
        tmp = U[mu] * Cshift(in_nu, mu, +1) * adj(U[mu]);
        tmp2 = adj(U[mu]) * in_nu * U[mu];
        sum += tmp + Cshift(tmp2, mu, -1) - 2.0 * in_nu;
      }
      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
      PokeIndex<LorentzIndex>(out, out_nu, nu);
    }
  }

  void MDeriv(const GaugeField &in, GaugeField &der)
  {
    // in is anti-hermitian
    RealD factor = -kappa / (double(4 * Nd));
    for (int mu = 0; mu < Nd; mu++)
    {
      GaugeLinkField der_mu(der._grid);
      der_mu = zero;
      for (int nu = 0; nu < Nd; nu++)
      {
        GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
        der_mu += U[mu] * Cshift(in_nu, mu, 1) * adj(U[mu]) * in_nu;
      }
      // the minus sign comes by using the in_nu instead of the
      // adjoint in the last multiplication
      PokeIndex<LorentzIndex>(der, -2.0 * factor * der_mu, mu);
    }
  }

  // separating this temporarily
  void MDeriv(const GaugeField &left, const GaugeField &right,
              GaugeField &der)
  {
    // in is anti-hermitian
    RealD factor = -kappa / (double(4 * Nd));
    for (int mu = 0; mu < Nd; mu++)
    {
      GaugeLinkField der_mu(der._grid);
      der_mu = zero;
      for (int nu = 0; nu < Nd; nu++)
      {
        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
        der_mu += U[mu] * Cshift(left_nu, mu, 1) * adj(U[mu]) * right_nu;
        der_mu += U[mu] * Cshift(right_nu, mu, 1) * adj(U[mu]) * left_nu;
      }
      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
    }
  }

  void Minv(const GaugeField &in, GaugeField &inverted)
  {
    HermitianLinearOperator<LaplacianAdjointField<Impl>, GaugeField> HermOp(*this);
    Solver(HermOp, in, inverted);
  }

  void MSquareRoot(GaugeField &P)
  {
    GaugeField Gp(P._grid);
    HermitianLinearOperator<LaplacianAdjointField<Impl>, GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter, PowerHalf);
    msCG(HermOp, P, Gp);
    P = Gp;
  }

  void MInvSquareRoot(GaugeField &P)
  {
    GaugeField Gp(P._grid);
    HermitianLinearOperator<LaplacianAdjointField<Impl>, GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter, PowerInvHalf);
    msCG(HermOp, P, Gp);
    P = Gp;
  }

 private:
  RealD kappa;
  std::vector<GaugeLinkField> U;
};

} // QCD
} // Grid
#endif
@@ -4,7 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid

  Source file: ./lib/qcd/action/scalar/CovariantLaplacian.h

- Copyright (C) 2017
+ Copyright (C) 2016

  Author: Guido Cossu <guido.cossu@ed.ac.uk>

@@ -30,57 +30,168 @@ directory
  #ifndef COVARIANT_LAPLACIAN_H
  #define COVARIANT_LAPLACIAN_H

- namespace Grid
- {
- namespace QCD
- {
+ namespace Grid {
+ namespace QCD {
+
+ struct LaplacianParams : Serializable {
+   GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams,
+                                   RealD, lo,
+                                   RealD, hi,
+                                   int, MaxIter,
+                                   RealD, tolerance,
+                                   int, degree,
+                                   int, precision);
+
+   // constructor
+   LaplacianParams(RealD lo = 0.0,
+                   RealD hi = 1.0,
+                   int maxit = 1000,
+                   RealD tol = 1.0e-8,
+                   int degree = 10,
+                   int precision = 64)
+     : lo(lo),
+       hi(hi),
+       MaxIter(maxit),
+       tolerance(tol),
+       degree(degree),
+       precision(precision){};
+ };

  ////////////////////////////////////////////////////////////
- // Laplacian operator L on fermion fields
+ // Laplacian operator L on adjoint fields
  //
- // phi: fermion field
+ // phi: adjoint field
+ // L: D_mu^dag D_mu
  //
- // L phi(x) = Sum_mu [ U_mu(x) phi(x+mu) + U_mu(x-mu) phi(x-mu) - 2phi(x)]
+ // L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag +
+ //                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu)
+ //                     -2phi(x)]
  //
  // Operator designed to be encapsulated by
  // an HermitianLinearOperator<.. , ..>
  ////////////////////////////////////////////////////////////

- // has to inherit from a fermion implementation
  template <class Impl>
- class Laplacian
- {
-  public:
-   INHERIT_IMPL_TYPES(Impl);
-
-   // add a bool to smear only in the spatial directions
-   Laplacian(GridBase *grid, bool spatial = false)
-       : U(Nd, grid), spatial_laplacian(spatial){};
-
-   void Mdir(const FermionField &, FermionField &, int, int) { assert(0); }
-   void Mdiag(const FermionField &, FermionField &) { assert(0); }
+ class LaplacianAdjointField: public Metric<typename Impl::Field> {
+   OperatorFunction<typename Impl::Field> &Solver;
+   LaplacianParams param;
+   MultiShiftFunction PowerHalf;
+   MultiShiftFunction PowerInvHalf;
+
+  public:
+   INHERIT_GIMPL_TYPES(Impl);
+
+   LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
+     : U(Nd, grid), Solver(S), param(p), kappa(k){
+     AlgRemez remez(param.lo,param.hi,param.precision);
+     std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+     remez.generateApprox(param.degree,1,2);
+     PowerHalf.Init(remez,param.tolerance,false);
+     PowerInvHalf.Init(remez,param.tolerance,true);
+   };

-   void ImportGauge(const GaugeField &_U)
-   {
-     for (int mu = 0; mu < Nd; mu++)
+   void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
+   void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
+
+   void ImportGauge(const GaugeField& _U) {
+     for (int mu = 0; mu < Nd; mu++) {
       U[mu] = PeekIndex<LorentzIndex>(_U, mu);
+     }
   }

-   void M(const FermionField &in, FermionField &out)
-   {
-     int dims = spatial_laplacian ? (Nd - 1) : Nd;
-
-     out = -2.0 * dims * in;
-     // eventually speed up with the stencil operator, if necessary
-     for (int mu = 0; mu < dims; mu++)
-       out += Impl::CovShiftForward(U[mu], mu, in) + Impl::CovShiftBackward(U[mu], mu, in);
+   void M(const GaugeField& in, GaugeField& out) {
+     // in is an antihermitian matrix
+     // test
+     //GaugeField herm = in + adj(in);
+     //std::cout << "AHermiticity: " << norm2(herm) << std::endl;
+     GaugeLinkField tmp(in._grid);
+     GaugeLinkField tmp2(in._grid);
+     GaugeLinkField sum(in._grid);
+     for (int nu = 0; nu < Nd; nu++) {
+       sum = zero;
+       GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
+       GaugeLinkField out_nu(out._grid);
+       for (int mu = 0; mu < Nd; mu++) {
+         tmp = U[mu] * Cshift(in_nu, mu, +1) * adj(U[mu]);
+         tmp2 = adj(U[mu]) * in_nu * U[mu];
+         sum += tmp + Cshift(tmp2, mu, -1) - 2.0 * in_nu;
+       }
+       out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
+       PokeIndex<LorentzIndex>(out, out_nu, nu);
+     }
   }

-  private:
-   bool spatial_laplacian;
+   void MDeriv(const GaugeField& in, GaugeField& der) {
+     // in is anti-hermitian
+     RealD factor = -kappa / (double(4 * Nd));
+     for (int mu = 0; mu < Nd; mu++){
+       GaugeLinkField der_mu(der._grid);
+       der_mu = zero;
+       for (int nu = 0; nu < Nd; nu++){
+         GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
+         der_mu += U[mu] * Cshift(in_nu, mu, 1) * adj(U[mu]) * in_nu;
+       }
+       // the minus sign comes by using the in_nu instead of the
+       // adjoint in the last multiplication
+       PokeIndex<LorentzIndex>(der, -2.0 * factor * der_mu, mu);
+     }
+   }
+
+   // separating this temporarily
+   void MDeriv(const GaugeField& left, const GaugeField& right,
+               GaugeField& der) {
+     // in is anti-hermitian
+     RealD factor = -kappa / (double(4 * Nd));
+     for (int mu = 0; mu < Nd; mu++) {
+       GaugeLinkField der_mu(der._grid);
+       der_mu = zero;
+       for (int nu = 0; nu < Nd; nu++) {
+         GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
+         GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
+         der_mu += U[mu] * Cshift(left_nu, mu, 1) * adj(U[mu]) * right_nu;
+         der_mu += U[mu] * Cshift(right_nu, mu, 1) * adj(U[mu]) * left_nu;
+       }
+       PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
+     }
+   }
+
+   void Minv(const GaugeField& in, GaugeField& inverted){
+     HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
+     Solver(HermOp, in, inverted);
+   }
+
+   void MSquareRoot(GaugeField& P){
+     GaugeField Gp(P._grid);
+     HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
+     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
+     msCG(HermOp,P,Gp);
+     P = Gp;
+   }
+
+   void MInvSquareRoot(GaugeField& P){
+     GaugeField Gp(P._grid);
+     HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
+     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
+     msCG(HermOp,P,Gp);
+     P = Gp;
+   }
+
+  private:
+   RealD kappa;
   std::vector<GaugeLinkField> U;
- }; // Laplacian
+ };

- } // QCD
- } // Grid
+ }
+ }
  #endif
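To make the role of the new class concrete, here is a usage sketch (not part of the diff): the adjoint Laplacian is handed a solver and then used as a Metric on the HMC momenta, with MSquareRoot/MInvSquareRoot applying L^(+1/2)/L^(-1/2) through the multishift CG built from the Remez data. Grid types are assumed from the library; the grid pointer, gauge field, momentum field and parameter values below are placeholders.

// Sketch, assuming the usual Grid headers are included and using namespace Grid::QCD.
void laplacian_usage_sketch(GridCartesian *UGrid, LatticeGaugeField &U, LatticeGaugeField &P)
{
  ConjugateGradient<LatticeGaugeField> CG(1.0e-8, 10000);    // solver handed to Minv()
  LaplacianParams LapPar(0.0001, 1.0, 1000, 1.0e-8, 12, 64); // lo, hi, maxit, tol, degree, precision (illustrative)
  RealD kappa = 0.9;                                         // smearing strength, illustrative

  LaplacianAdjointField<PeriodicGimplR> Laplacian(UGrid, CG, LapPar, kappa);
  Laplacian.ImportGauge(U);      // cache the links U_mu
  Laplacian.MInvSquareRoot(P);   // P <- L^{-1/2} P via multishift CG on the x^{-1/2} approximation
}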
@@ -70,8 +70,8 @@ XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("")
  pugi::xml_parse_result result;
  result = doc_.load_string(xmlstring);
  if ( !result ) {
- cerr << "XML error description: " << result.description() << "\n";
- cerr << "XML error offset     : " << result.offset << "\n";
+ cerr << "XML error description (from char *): " << result.description() << "\nXML\n"<< xmlstring << "\n";
+ cerr << "XML error offset     (from char *) " << result.offset << "\nXML\n"<< xmlstring <<"\n";
  abort();
  }
  if ( toplev == std::string("") ) {
@@ -87,8 +87,8 @@ XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName)
  pugi::xml_parse_result result;
  result = doc_.load_file(fileName_.c_str());
  if ( !result ) {
- cerr << "XML error description: " << result.description() << "\n";
- cerr << "XML error offset     : " << result.offset << "\n";
+ cerr << "XML error description: " << result.description() <<" "<< fileName_ <<"\n";
+ cerr << "XML error offset     : " << result.offset <<" "<< fileName_ <<"\n";
  abort();
  }
  if ( toplev == std::string("") ) {
@@ -51,7 +51,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  #define PARALLEL_CRITICAL
  #endif

+ #define parallel_region          PARALLEL_REGION
  #define parallel_for             PARALLEL_FOR_LOOP for
+ #define parallel_for_internal    PARALLEL_FOR_LOOP_INTERN for
  #define parallel_for_nest2       PARALLEL_NESTED_LOOP2 for

  namespace Grid {
@@ -208,7 +208,7 @@ static int Grid_is_initialised = 0;

  void Grid_init(int *argc,char ***argv)
  {
- GridLogger::StopWatch.Start();
+ GridLogger::GlobalStopWatch.Start();

  std::string arg;

@@ -243,6 +243,12 @@ void Grid_init(int *argc,char ***argv)
  fname<<CartesianCommunicator::RankWorld();
  fp=freopen(fname.str().c_str(),"w",stdout);
  assert(fp!=(FILE *)NULL);
+
+ std::ostringstream ename;
+ ename<<"Grid.stderr.";
+ ename<<CartesianCommunicator::RankWorld();
+ fp=freopen(ename.str().c_str(),"w",stderr);
+ assert(fp!=(FILE *)NULL);
  }

  ////////////////////////////////////
@@ -26,6 +26,25 @@ namespace Grid{
  }
  }

+ static inline void IndexFromCoorReversed (const std::vector<int>& coor,int &index,const std::vector<int> &dims){
+   int nd=dims.size();
+   int stride=1;
+   index=0;
+   for(int d=nd-1;d>=0;d--){
+     index = index+stride*coor[d];
+     stride=stride*dims[d];
+   }
+ }
+ static inline void CoorFromIndexReversed (std::vector<int>& coor,int index,const std::vector<int> &dims){
+   int nd= dims.size();
+   coor.resize(nd);
+   for(int d=nd-1;d>=0;d--){
+     coor[d] = index % dims[d];
+     index   = index / dims[d];
+   }
+ }
+
  };

  }
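The two new helpers invert each other: IndexFromCoorReversed makes the last dimension run fastest (the mirror image of the existing IndexFromCoor), and CoorFromIndexReversed unpacks that index again. A small standalone check of the convention (plain C++, independent of Grid; the dimensions and coordinates are made-up test values):

#include <cassert>
#include <vector>

// Same arithmetic as the new Grid helpers, reproduced for a standalone round-trip check.
static void IndexFromCoorReversed(const std::vector<int>& coor, int& index, const std::vector<int>& dims) {
  int stride = 1;
  index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--) {
    index += stride * coor[d];   // dimension nd-1 varies fastest
    stride *= dims[d];
  }
}

static void CoorFromIndexReversed(std::vector<int>& coor, int index, const std::vector<int>& dims) {
  coor.resize(dims.size());
  for (int d = (int)dims.size() - 1; d >= 0; d--) {
    coor[d] = index % dims[d];
    index   = index / dims[d];
  }
}

int main() {
  std::vector<int> dims = {2, 3, 4};
  int index;
  IndexFromCoorReversed({1, 2, 3}, index, dims);
  assert(index == 1 * 12 + 2 * 4 + 3);   // == 23: coordinate 0 is the slowest direction
  std::vector<int> coor;
  CoorFromIndexReversed(coor, index, dims);
  assert(coor[0] == 1 && coor[1] == 2 && coor[2] == 3);  // round trip
  return 0;
}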
@@ -1,4 +1,4 @@
- SUBDIRS = . core forces hmc solver debug smearing IO
+ SUBDIRS = . core forces hmc solver debug smearing IO lanczos

  if BUILD_CHROMA_REGRESSION
    SUBDIRS+= qdpxx
@@ -1,105 +0,0 @@ (deleted file)
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_laplacian.cc

    Copyright (C) 2017

    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    (standard GNU GPL v2 license text)
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian             Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

  std::vector<int> point({0,0,0,0});

  LatticeFermion src (&Grid); //random(pRNG,src);
  SpinColourVectorD Sp;
  for (unsigned int s = 0; s < Ns; ++s)
    for (unsigned int c = 0; c < Nc; ++c)
      Sp()(s)(c) = 1;

  src = zero;
  pokeSite(Sp,src,point);

  LatticeFermion result(&Grid); result=zero;
  LatticeFermion    tmp(&Grid); tmp=zero;

  // Gauge configuration
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing the laplacian operator on a point source           "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;

  Laplacian<WilsonImplR> LaplaceOperator(src._grid);
  LaplaceOperator.ImportGauge(Umu);
  LaplaceOperator.M(src, result);

  std::cout << "Source vector" << std::endl;
  std::cout << src << std::endl;

  std::cout << "Result vector" << std::endl;
  std::cout << result << std::endl;

  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing the laplacian smearing operator on a point source  "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;

  LatticeFermion smeared (&Grid); smeared = src;
  for (int smr = 0; smr < 10; ++smr)
  {
    LaplaceOperator.M(smeared, tmp);
    smeared += 0.1*tmp;
  }

  std::cout << "Smeared vector" << std::endl;
  std::cout << smeared << std::endl;

  // Norm of vector
  LatticeComplex smr_norm(&Grid);
  smr_norm = localNorm2(smeared);
  std::cout << "Smeared vector norm" << std::endl;
  std::cout << smr_norm << std::endl;

  Grid_finalize();
}
@@ -37,8 +37,15 @@ RealD InverseApproximation(RealD x){
  RealD SqrtApproximation(RealD x){
    return std::sqrt(x);
  }
+ RealD Approximation32(RealD x){
+   return std::pow(x,-1.0/32.0);
+ }
+ RealD Approximation2(RealD x){
+   return std::pow(x,-1.0/2.0);
+ }

  RealD StepFunction(RealD x){
-   if ( x<0.1 )  return 1.0;
+   if ( x<10.0 ) return 1.0;
    else return 0.0;
  }

@@ -56,7 +63,6 @@ int main (int argc, char ** argv)

  Chebyshev<LatticeFermion> ChebyInv(lo,hi,2000,InverseApproximation);
-
  {
    std::ofstream of("chebyinv");
    ChebyInv.csv(of);
@@ -78,7 +84,6 @@ int main (int argc, char ** argv)

-
  ChebyStep.JacksonSmooth();

  {
    std::ofstream of("chebystepjack");
    ChebyStep.csv(of);
@@ -100,5 +105,30 @@ int main (int argc, char ** argv)
    ChebyNE.csv(of);
  }

+ lo=0.0;
+ hi=4.0;
+ Chebyshev<LatticeFermion> Cheby32(lo,hi,2000,Approximation32);
+ {
+   std::ofstream of("cheby32");
+   Cheby32.csv(of);
+ }
+ Cheby32.JacksonSmooth();
+ {
+   std::ofstream of("cheby32jack");
+   Cheby32.csv(of);
+ }
+
+ Chebyshev<LatticeFermion> ChebySqrt(lo,hi,2000,Approximation2);
+ {
+   std::ofstream of("chebysqrt");
+   ChebySqrt.csv(of);
+ }
+ ChebySqrt.JacksonSmooth();
+ {
+   std::ofstream of("chebysqrtjack");
+   ChebySqrt.csv(of);
+ }
+
  Grid_finalize();
  }
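The new branches fit x^(-1/32) and x^(-1/2) on [0,4] with a degree-2000 Chebyshev expansion and dump the error profile before and after Jackson smoothing. The essence of what Chebyshev<Field> does can be reproduced in a few lines of plain C++ (coefficients from Chebyshev-Gauss quadrature, evaluation by the Clenshaw recurrence). This is an illustrative sketch, not Grid's implementation, and the interval [0.1,4] and degree 64 are made-up values chosen so the sample function stays finite:

#include <cmath>
#include <cstdio>
#include <vector>

// Fit f on [lo,hi] with N Chebyshev coefficients; evaluate with Clenshaw's recurrence.
struct ChebyFit {
  double lo, hi;
  std::vector<double> c;
  ChebyFit(double lo_, double hi_, int N, double (*f)(double)) : lo(lo_), hi(hi_), c(N) {
    const double PI = std::acos(-1.0);
    for (int j = 0; j < N; j++) {
      double s = 0.0;
      for (int k = 0; k < N; k++) {
        double t = std::cos(PI * (k + 0.5) / N);          // Chebyshev node in [-1,1]
        double x = 0.5 * (hi + lo) + 0.5 * (hi - lo) * t; // mapped back to [lo,hi]
        s += f(x) * std::cos(PI * j * (k + 0.5) / N);
      }
      c[j] = 2.0 * s / N;
    }
  }
  double eval(double x) const {
    double t = (2.0 * x - lo - hi) / (hi - lo);           // map [lo,hi] -> [-1,1]
    double b1 = 0.0, b2 = 0.0;
    for (int j = (int)c.size() - 1; j >= 1; j--) {
      double b0 = 2.0 * t * b1 - b2 + c[j];
      b2 = b1; b1 = b0;
    }
    return t * b1 - b2 + 0.5 * c[0];                      // Clenshaw closing step
  }
};

int main() {
  ChebyFit cheby(0.1, 4.0, 64, [](double x) { return std::pow(x, -0.5); });
  std::printf("x=0.6789: exact %.6f  chebyshev %.6f\n", std::pow(0.6789, -0.5), cheby.eval(0.6789));
  return 0;
}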
@@ -1,195 +0,0 @@ (deleted file)
/*******************************************************************************
 Grid physics library, www.github.com/paboyle/Grid

 Source file: tests/hadrons/Test_hadrons_meson_3pt.cc

 Copyright (C) 2015

 Author: Antonin Portelli <antonin.portelli@me.com>
 (standard GNU GPL v2 license text)
*******************************************************************************/

#include <Grid/Hadrons/Application.hpp>

using namespace Grid;
using namespace Hadrons;

int main(int argc, char *argv[])
{
    // initialization //////////////////////////////////////////////////////////
    Grid_init(&argc, &argv);
    HadronsLogError.Active(GridLogError.isActive());
    HadronsLogWarning.Active(GridLogWarning.isActive());
    HadronsLogMessage.Active(GridLogMessage.isActive());
    HadronsLogIterative.Active(GridLogIterative.isActive());
    HadronsLogDebug.Active(GridLogDebug.isActive());
    LOG(Message) << "Grid initialized" << std::endl;

    // run setup ///////////////////////////////////////////////////////////////
    Application              application;
    std::vector<std::string> flavour = {"l", "s", "c1", "c2", "c3"};
    std::vector<double>      mass    = {.01, .04, .2  , .25 , .3  };
    unsigned int             nt      = GridDefaultLatt()[Tp];

    // global parameters
    Application::GlobalPar globalPar;
    globalPar.trajCounter.start = 1500;
    globalPar.trajCounter.end   = 1520;
    globalPar.trajCounter.step  = 20;
    globalPar.seed              = "1 2 3 4";
    globalPar.genetic.maxGen    = 1000;
    globalPar.genetic.maxCstGen = 200;
    globalPar.genetic.popSize   = 20;
    globalPar.genetic.mutationRate = .1;
    application.setPar(globalPar);

    // gauge field
    application.createModule<MGauge::Unit>("gauge");

    // set fermion boundary conditions to be periodic space, antiperiodic time.
    std::string boundary = "1 1 1 -1";

    // sink
    MSink::Point::Par sinkPar;
    sinkPar.mom = "0 0 0";
    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
    for (unsigned int i = 0; i < flavour.size(); ++i)
    {
        // actions
        MAction::DWF::Par actionPar;
        actionPar.gauge = "gauge";
        actionPar.Ls    = 12;
        actionPar.M5    = 1.8;
        actionPar.mass  = mass[i];
        actionPar.boundary = boundary;
        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);

        // solvers
        MSolver::RBPrecCG::Par solverPar;
        solverPar.action   = "DWF_" + flavour[i];
        solverPar.residual = 1.0e-8;
        application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
                                                    solverPar);
    }
    for (unsigned int t = 0; t < nt; t += 1)
    {
        std::string                           srcName;
        std::string                           lapName;
        std::vector<std::string>              qName;
        std::vector<std::vector<std::string>> seqName;

        // Z2 source
        MSource::Z2::Par z2Par;
        z2Par.tA = t;
        z2Par.tB = t;
        srcName  = "z2_" + std::to_string(t);
        application.createModule<MSource::Z2>(srcName, z2Par);

        // Example of smearing of the source
        MSource::LaplaceSmearing::Par LapPar;
        LapPar.N = 10;
        LapPar.alpha = 0.1;
        LapPar.source = srcName;
        LapPar.gauge = "gauge";
        lapName = "z2smr_" + std::to_string(t);
        application.createModule<MSource::LaplaceSmearing>(lapName, LapPar);

        for (unsigned int i = 0; i < flavour.size(); ++i)
        {
            // sequential sources
            MSource::SeqGamma::Par seqPar;
            qName.push_back("QZ2_" + flavour[i] + "_" + std::to_string(t));
            seqPar.q   = qName[i];
            seqPar.tA  = (t + nt/4) % nt;
            seqPar.tB  = (t + nt/4) % nt;
            seqPar.mom = "1. 0. 0. 0.";
            seqName.push_back(std::vector<std::string>(Nd));
            for (unsigned int mu = 0; mu < Nd; ++mu)
            {
                seqPar.gamma   = 0x1 << mu;
                seqName[i][mu] = "G" + std::to_string(seqPar.gamma)
                                 + "_" + std::to_string(seqPar.tA) + "-"
                                 + qName[i];
                application.createModule<MSource::SeqGamma>(seqName[i][mu], seqPar);
            }

            // propagators
            MFermion::GaugeProp::Par quarkPar;
            quarkPar.solver = "CG_" + flavour[i];
            quarkPar.source = srcName;
            application.createModule<MFermion::GaugeProp>(qName[i], quarkPar);
            for (unsigned int mu = 0; mu < Nd; ++mu)
            {
                quarkPar.source = seqName[i][mu];
                seqName[i][mu]  = "Q_" + flavour[i] + "-" + seqName[i][mu];
                application.createModule<MFermion::GaugeProp>(seqName[i][mu], quarkPar);
            }
        }

        // contractions
        MContraction::Meson::Par mesPar;
        for (unsigned int i = 0; i < flavour.size(); ++i)
        for (unsigned int j = i; j < flavour.size(); ++j)
        {
            mesPar.output = "mesons/Z2_" + flavour[i] + flavour[j];
            mesPar.q1     = qName[i];
            mesPar.q2     = qName[j];
            mesPar.gammas = "all";
            mesPar.sink   = "sink";
            application.createModule<MContraction::Meson>("meson_Z2_"
                                                          + std::to_string(t)
                                                          + "_"
                                                          + flavour[i]
                                                          + flavour[j],
                                                          mesPar);
        }
        for (unsigned int i = 0; i < flavour.size(); ++i)
        for (unsigned int j = 0; j < flavour.size(); ++j)
        for (unsigned int mu = 0; mu < Nd; ++mu)
        {
            MContraction::Meson::Par mesPar;

            mesPar.output = "3pt/Z2_" + flavour[i] + flavour[j] + "_"
                            + std::to_string(mu);
            mesPar.q1     = qName[i];
            mesPar.q2     = seqName[j][mu];
            mesPar.gammas = "all";
            mesPar.sink   = "sink";
            application.createModule<MContraction::Meson>("3pt_Z2_"
                                                          + std::to_string(t)
                                                          + "_"
                                                          + flavour[i]
                                                          + flavour[j]
                                                          + "_"
                                                          + std::to_string(mu),
                                                          mesPar);
        }
    }

    // execution
    application.saveParameterFile("meson3pt.xml");
    application.run();

    // epilogue
    LOG(Message) << "Grid is finalizing now" << std::endl;
    Grid_finalize();

    return EXIT_SUCCESS;
}
@@ -38,11 +38,11 @@ int main (int argc, char ** argv)

  std::cout<<GridLogMessage << "Testing Remez"<<std::endl;

- double lo=0.01;
- double hi=1.0;
+ double lo=1.0e-3;
+ double hi=5.0;
  int precision=64;
- int degree=10;
- AlgRemez remez(0.001,1.0,precision);
+ int degree=16;
+ AlgRemez remez(lo,hi,precision);

  ////////////////////////////////////////
  // sqrt and inverse sqrt
@@ -50,21 +50,50 @@ int main (int argc, char ** argv)

  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
  remez.generateApprox(degree,1,2);
- MultiShiftFunction Sqrt(remez,1.0,false);
- MultiShiftFunction InvSqrt(remez,1.0,true);
+ MultiShiftFunction Root2(remez,1.0,false);
+ MultiShiftFunction InvRoot2(remez,1.0,true);

  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/4)"<<std::endl;
  remez.generateApprox(degree,1,4);
- MultiShiftFunction SqrtSqrt(remez,1.0,false);
- MultiShiftFunction InvSqrtSqrt(remez,1.0,true);
+ MultiShiftFunction Root4(remez,1.0,false);
+ MultiShiftFunction InvRoot4(remez,1.0,true);
+
+ std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/8)"<<std::endl;
+ remez.generateApprox(degree,1,8);
+ MultiShiftFunction Root8(remez,1.0,false);
+ MultiShiftFunction InvRoot8(remez,1.0,true);
+
+ std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/16)"<<std::endl;
+ remez.generateApprox(degree,1,16);
+ MultiShiftFunction Root16(remez,1.0,false);
+ MultiShiftFunction InvRoot16(remez,1.0,true);
+
+ std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/32)"<<std::endl;
+ remez.generateApprox(degree,1,32);
+ MultiShiftFunction Root32(remez,1.0,false);
+ MultiShiftFunction InvRoot32(remez,1.0,true);
+
+ ofstream gnuplot(std::string("Root2.gnu"),std::ios::out|std::ios::trunc);
+ Root2.gnuplot(gnuplot);
+
+ ofstream gnuplot_i2(std::string("InvRoot2.gnu"),std::ios::out|std::ios::trunc);
+ InvRoot2.gnuplot(gnuplot_i2);
+
+ ofstream gnuplot_i4(std::string("InvRoot4.gnu"),std::ios::out|std::ios::trunc);
+ InvRoot4.gnuplot(gnuplot_i4);
+
+ ofstream gnuplot_i8(std::string("InvRoot8.gnu"),std::ios::out|std::ios::trunc);
+ InvRoot8.gnuplot(gnuplot_i8);
+
+ ofstream gnuplot_i16(std::string("InvRoot16.gnu"),std::ios::out|std::ios::trunc);
+ InvRoot16.gnuplot(gnuplot_i16);
+
+ ofstream gnuplot_i32(std::string("InvRoot32.gnu"),std::ios::out|std::ios::trunc);
+ InvRoot32.gnuplot(gnuplot_i32);

- ofstream gnuplot(std::string("Sqrt.gnu"),std::ios::out|std::ios::trunc);
- Sqrt.gnuplot(gnuplot);
-
- ofstream gnuplot_inv(std::string("InvSqrt.gnu"),std::ios::out|std::ios::trunc);
- InvSqrt.gnuplot(gnuplot);

  double     x=0.6789;
  double    sx=std::sqrt(x);
@@ -72,10 +101,10 @@ int main (int argc, char ** argv)
  double   isx=1.0/sx;
  double  issx=1.0/ssx;

- double  asx  =Sqrt.approx(x);
- double  assx =SqrtSqrt.approx(x);
- double  aisx =InvSqrt.approx(x);
- double  aissx=InvSqrtSqrt.approx(x);
+ double  asx  =Root2.approx(x);
+ double  assx =Root4.approx(x);
+ double  aisx =InvRoot2.approx(x);
+ double  aissx=InvRoot4.approx(x);

  std::cout<<GridLogMessage << "x^(1/2) : "<<sx<<" "<<asx<<std::endl;
  std::cout<<GridLogMessage << "x^(1/4) : "<<ssx<<" "<<assx<<std::endl;
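The renamed Root2/InvRoot2 etc. are still MultiShiftFunction objects: the Remez output is stored as a partial-fraction (multishift) representation, which is what lets a single multishift CG apply all the shifted inverses at once. Schematically, for degree d (standard rational-approximation form, summarised here rather than quoted from the source):

\[
x^{\pm 1/2n} \;\approx\; \alpha_0 \;+\; \sum_{k=1}^{d} \frac{\alpha_k}{x + \beta_k},
\qquad\text{so}\qquad
M^{-1/2}\,\phi \;\approx\; \alpha_0\,\phi \;+\; \sum_{k=1}^{d} \alpha_k\,(M + \beta_k)^{-1}\phi ,
\]

with every shifted solve \((M+\beta_k)^{-1}\phi\) obtained from one ConjugateGradientMultiShift call, exactly as MSquareRoot/MInvSquareRoot do above.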
tests/lanczos/BlockProjector.h (new file, 143 lines)
@@ -0,0 +1,143 @@
namespace Grid {

/*
  BlockProjector

  If _HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not
  fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision
  coarse field than basis field.
*/
//#define _HP_BLOCK_PROJECTORS_

template<typename Field>
class BlockProjector {
public:

  BasisFieldVector<Field>& _evec;
  BlockedGrid<Field>& _bgrid;

  BlockProjector(BasisFieldVector<Field>& evec, BlockedGrid<Field>& bgrid) : _evec(evec), _bgrid(bgrid) {
  }

  void createOrthonormalBasis(RealD thres = 0.0) {

    GridStopWatch sw;
    sw.Start();

    int cnt = 0;

#pragma omp parallel shared(cnt)
    {
      int lcnt = 0;

#pragma omp for
      for (int b=0;b<_bgrid._o_blocks;b++) {

	for (int i=0;i<_evec._Nm;i++) {

	  auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);

	  // |i> -= <j|i> |j>
	  for (int j=0;j<i;j++) {
	    _bgrid.block_caxpy(b,_evec._v[i],-_bgrid.block_sp(b,_evec._v[j],_evec._v[i]),_evec._v[j],_evec._v[i]);
	  }

	  auto nrm = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);

	  auto eps = nrm/nrm0;
	  if (Reduce(eps).real() < thres) {
	    lcnt++;
	  }

	  // TODO: if norm is too small, remove this eigenvector/mark as not needed; in practice: set it to zero norm here and return a mask
	  // that is then used later to decide not to write certain eigenvectors to disk (add a norm calculation before subtraction step and look at nrm/nrm0 < eps to decide)
	  _bgrid.block_cscale(b,1.0 / sqrt(nrm),_evec._v[i]);

	}

      }

#pragma omp critical
      {
	cnt += lcnt;
      }
    }
    sw.Stop();
    std::cout << GridLogMessage << "Gram-Schmidt to create blocked basis took " << sw.Elapsed() << " (" << ((RealD)cnt / (RealD)_bgrid._o_blocks / (RealD)_evec._Nm)
	      << " below threshold)" << std::endl;

  }

  template<typename CoarseField>
  void coarseToFine(const CoarseField& in, Field& out) {

    out = zero;
    out.checkerboard = _evec._v[0].checkerboard;

    int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]);
    assert(Nbasis == _evec._Nm);

#pragma omp parallel for
    for (int b=0;b<_bgrid._o_blocks;b++) {
      for (int j=0;j<_evec._Nm;j++) {
	_bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out);
      }
    }

  }

  template<typename CoarseField>
  void fineToCoarse(const Field& in, CoarseField& out) {

    out = zero;

    int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]);
    assert(Nbasis == _evec._Nm);

    Field tmp(_bgrid._grid);
    tmp = in;

#pragma omp parallel for
    for (int b=0;b<_bgrid._o_blocks;b++) {
      for (int j=0;j<_evec._Nm;j++) {
	// |rhs> -= <j|rhs> |j>
	auto c = _bgrid.block_sp(b,_evec._v[j],tmp);
	_bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable
	out._odata[b]._internal._internal[j] = c;
      }
    }

  }

  template<typename CoarseField>
  void deflateFine(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
    result = zero;
    for (int i=0;i<N;i++) {
      Field tmp(result._grid);
      coarseToFine(_coef._v[i],tmp);
      axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
    }
  }

  template<typename CoarseField>
  void deflateCoarse(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
    CoarseField src_coarse(_coef._v[0]._grid);
    CoarseField result_coarse = src_coarse;
    result_coarse = zero;
    fineToCoarse(src_orig,src_coarse);
    for (int i=0;i<N;i++) {
      axpy(result_coarse,TensorRemove(innerProduct(_coef._v[i],src_coarse)) / eval[i],_coef._v[i],result_coarse);
    }
    coarseToFine(result_coarse,result);
  }

  template<typename CoarseField>
  void deflate(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
    // Deflation on coarse Grid is much faster, so use it by default.  Deflation on fine Grid is kept for legacy reasons for now.
    deflateCoarse(_coef,eval,N,src_orig,result);
  }

};
}

tests/lanczos/BlockedGrid.h (new file, 401 lines; listing truncated below)
@@ -0,0 +1,401 @@
namespace Grid {

template<typename Field>
class BlockedGrid {
public:
  GridBase* _grid;
  typedef typename Field::scalar_type Coeff_t;
  typedef typename Field::vector_type vCoeff_t;

  std::vector<int> _bs;     // block size
  std::vector<int> _nb;     // number of blocks
  std::vector<int> _l;      // local dimensions irrespective of cb
  std::vector<int> _l_cb;   // local dimensions of checkerboarded vector
  std::vector<int> _l_cb_o; // local dimensions of inner checkerboarded vector
  std::vector<int> _bs_cb;  // block size in checkerboarded vector
  std::vector<int> _nb_o;   // number of blocks of simd o-sites

  int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites;

  BlockedGrid(GridBase* grid, const std::vector<int>& block_size) :
    _grid(grid), _bs(block_size), _nd((int)_bs.size()),
      _nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size),
      _l_cb_o(block_size), _bs_cb(block_size) {

      _blocks = 1;
      _o_blocks = 1;
      _l = grid->FullDimensions();
      _l_cb = grid->LocalDimensions();
      _l_cb_o = grid->_rdimensions;

      _cf_size = 1;
      _block_sites = 1;
      for (int i=0;i<_nd;i++) {
	_l[i] /= grid->_processors[i];

	assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize

	int r = _l[i] / _l_cb[i];
	assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize
	_bs_cb[i] = _bs[i] / r;
	_block_sites *= _bs_cb[i];
	_nb[i] = _l[i] / _bs[i];
	_nb_o[i] = _nb[i] / _grid->_simd_layout[i];
	if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize
	  std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl;
	  assert(0);
	}
	_blocks *= _nb[i];
	_o_blocks *= _nb_o[i];
	_cf_size *= _l[i];
      }

      _cf_size *= 12 / 2;
      _cf_block_size = _cf_size / _blocks;
      _cf_o_block_size = _cf_size / _o_blocks;

      std::cout << GridLogMessage << "BlockedGrid:" << std::endl;
      std::cout << GridLogMessage << " _l     = " << _l << std::endl;
      std::cout << GridLogMessage << " _l_cb     = " << _l_cb << std::endl;
      std::cout << GridLogMessage << " _l_cb_o     = " << _l_cb_o << std::endl;
      std::cout << GridLogMessage << " _bs    = " << _bs << std::endl;
      std::cout << GridLogMessage << " _bs_cb    = " << _bs_cb << std::endl;
      std::cout << GridLogMessage << " _nb    = " << _nb << std::endl;
      std::cout << GridLogMessage << " _nb_o    = " << _nb_o << std::endl;
      std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl;
      std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl;
      std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl;
      std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl;
      std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl;
      std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl;
      std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl;

      //	_grid->Barrier();
      //abort();
    }

    void block_to_coor(int b, std::vector<int>& x0) {

      std::vector<int> bcoor;
      bcoor.resize(_nd);
      x0.resize(_nd);
      assert(b < _o_blocks);
      Lexicographic::CoorFromIndex(bcoor,b,_nb_o);
      int i;

      for (i=0;i<_nd;i++) {
	x0[i] = bcoor[i]*_bs_cb[i];
      }

      //std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl;

    }

    void block_site_to_o_coor(const std::vector<int>& x0, std::vector<int>& coor, int i) {
      Lexicographic::CoorFromIndex(coor,i,_bs_cb);
      for (int j=0;j<_nd;j++)
	coor[j] += x0[j];
    }

    int block_site_to_o_site(const std::vector<int>& x0, int i) {
      std::vector<int> coor;  coor.resize(_nd);
      block_site_to_o_coor(x0,coor,i);
      Lexicographic::IndexFromCoor(coor,i,_l_cb_o);
      return i;
    }

    vCoeff_t block_sp(int b, const Field& x, const Field& y) {

      std::vector<int> x0;
      block_to_coor(b,x0);

      vCoeff_t ret = 0.0;
      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);
	ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss]));
      }

      return ret;

    }

    vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) {

      std::vector<int> x0;
      block_to_coor(b,x0);

      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
      int lsize = _cf_o_block_size / _block_sites;

      std::vector< ComplexD > ret(nsimd);
      for (int i=0;i<nsimd;i++)
	ret[i] = 0.0;

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);

	int n = lsize / nsimd;
	for (int l=0;l<n;l++) {
	  for (int j=0;j<nsimd;j++) {
	    int t = lsize * i + l*nsimd + j;

	    ret[j] += conjugate(((Coeff_t*)&x._odata[ss]._internal)[l*nsimd + j]) * y[t];
	  }
	}
      }

      vCoeff_t vret;
      for (int i=0;i<nsimd;i++)
	((Coeff_t*)&vret)[i] = (Coeff_t)ret[i];

      return vret;

    }

    template<class T>
    void vcaxpy(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x,const iScalar<T>& y) {
      vcaxpy(r._internal,a,x._internal,y._internal);
    }

    template<class T,int N>
    void vcaxpy(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x,const iVector<T,N>& y) {
      for (int i=0;i<N;i++)
	vcaxpy(r._internal[i],a,x._internal[i],y._internal[i]);
    }

    void vcaxpy(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x,const vCoeff_t& y) {
      r = a*x + y;
    }

    void block_caxpy(int b, Field& ret, const vCoeff_t& a, const Field& x, const Field& y) {

      std::vector<int> x0;
      block_to_coor(b,x0);

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);
	vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]);
      }

    }

    void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) {
      std::vector<int> x0;
      block_to_coor(b,x0);

      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
      int lsize = _cf_o_block_size / _block_sites;

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);

	int n = lsize / nsimd;
	for (int l=0;l<n;l++) {
	  vCoeff_t r = a* ((vCoeff_t*)&x._odata[ss]._internal)[l];

	  for (int j=0;j<nsimd;j++) {
	    int t = lsize * i + l*nsimd + j;
	    ret[t] = y[t] + ((Coeff_t*)&r)[j];
	  }
	}
      }

    }

    void block_set(int b, Field& ret, const std::vector< ComplexD >& x) {
      std::vector<int> x0;
      block_to_coor(b,x0);

      int lsize = _cf_o_block_size / _block_sites;

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);

	for (int l=0;l<lsize;l++)
	  ((Coeff_t*)&ret._odata[ss]._internal)[l] = (Coeff_t)x[lsize * i + l]; // convert precision
      }

    }

    void block_get(int b, const Field& ret, std::vector< ComplexD >& x) {
      std::vector<int> x0;
      block_to_coor(b,x0);

      int lsize = _cf_o_block_size / _block_sites;

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);

	for (int l=0;l<lsize;l++)
	  x[lsize * i + l] = (ComplexD)((Coeff_t*)&ret._odata[ss]._internal)[l];
      }

    }

    template<class T>
    void vcscale(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x) {
      vcscale(r._internal,a,x._internal);
    }

    template<class T,int N>
    void vcscale(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x) {
      for (int i=0;i<N;i++)
	vcscale(r._internal[i],a,x._internal[i]);
    }

    void vcscale(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x) {
      r = a*x;
    }

    void block_cscale(int b, const vCoeff_t& a, Field& ret) {

      std::vector<int> x0;
      block_to_coor(b,x0);

      for (int i=0;i<_block_sites;i++) { // only odd sites
	int ss = block_site_to_o_site(x0,i);
	vcscale(ret._odata[ss],a,ret._odata[ss]);
      }
    }

    void getCanonicalBlockOffset(int cb, std::vector<int>& x0) {
      const int ndim = 5;
      assert(_nb.size() == ndim);
      std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
      x0.resize(ndim);

      assert(cb >= 0);
      assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]);

      Lexicographic::CoorFromIndex(x0,cb,_nbc);
      int i;

      for (i=0;i<ndim;i++) {
	x0[i] *= _bsc[i];
      }

      //if (cb < 2)
      //	std::cout << GridLogMessage << "Map: " << cb << " To: " << x0 << std::endl;
    }

    void pokeBlockOfVectorCanonical(int cb,Field& v,const std::vector<float>& buf) {
      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
      std::vector<int> ldim = v._grid->LocalDimensions();
      std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
      const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
      // take canonical block cb of v and put it in canonical ordering in buf
|
||||||
|
std::vector<int> cx0;
|
||||||
|
getCanonicalBlockOffset(cb,cx0);
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
std::vector<int> co0,cl0;
|
||||||
|
co0=cx0; cl0=cx0;
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
|
for (int i=0;i<_nbsc;i++) {
|
||||||
|
Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
|
||||||
|
for (int j=0;j<(int)_bsc.size();j++)
|
||||||
|
cl0[j] = cx0[j] + co0[j];
|
||||||
|
|
||||||
|
std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
|
||||||
|
int oi = v._grid->oIndex(l0);
|
||||||
|
int ii = v._grid->iIndex(l0);
|
||||||
|
int lti = i;
|
||||||
|
|
||||||
|
//if (cb < 2 && i<2)
|
||||||
|
// std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
|
||||||
|
|
||||||
|
for (int s=0;s<4;s++)
|
||||||
|
for (int c=0;c<3;c++) {
|
||||||
|
Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
|
||||||
|
int ti = 12*lti + 3*s + c;
|
||||||
|
ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector<float>& buf) {
|
||||||
|
std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
|
||||||
|
std::vector<int> ldim = v._grid->LocalDimensions();
|
||||||
|
std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
|
||||||
|
const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
|
||||||
|
// take canonical block cb of v and put it in canonical ordering in buf
|
||||||
|
std::vector<int> cx0;
|
||||||
|
getCanonicalBlockOffset(cb,cx0);
|
||||||
|
|
||||||
|
buf.resize(_cf_block_size * 2);
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
std::vector<int> co0,cl0;
|
||||||
|
co0=cx0; cl0=cx0;
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
|
for (int i=0;i<_nbsc;i++) {
|
||||||
|
Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
|
||||||
|
for (int j=0;j<(int)_bsc.size();j++)
|
||||||
|
cl0[j] = cx0[j] + co0[j];
|
||||||
|
|
||||||
|
std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
|
||||||
|
int oi = v._grid->oIndex(l0);
|
||||||
|
int ii = v._grid->iIndex(l0);
|
||||||
|
int lti = i;
|
||||||
|
|
||||||
|
//if (cb < 2 && i<2)
|
||||||
|
// std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
|
||||||
|
|
||||||
|
for (int s=0;s<4;s++)
|
||||||
|
for (int c=0;c<3;c++) {
|
||||||
|
Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
|
||||||
|
int ti = 12*lti + 3*s + c;
|
||||||
|
buf[2*ti+0] = ld.real();
|
||||||
|
buf[2*ti+1] = ld.imag();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int globalToLocalCanonicalBlock(int slot,const std::vector<int>& src_nodes,int nb) {
|
||||||
|
// processor coordinate
|
||||||
|
int _nd = (int)src_nodes.size();
|
||||||
|
std::vector<int> _src_nodes = src_nodes;
|
||||||
|
std::vector<int> pco(_nd);
|
||||||
|
Lexicographic::CoorFromIndex(pco,slot,_src_nodes);
|
||||||
|
std::vector<int> cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] };
|
||||||
|
|
||||||
|
// get local block
|
||||||
|
std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
|
||||||
|
assert(_nd == 5);
|
||||||
|
std::vector<int> c_src_local_blocks(_nd);
|
||||||
|
for (int i=0;i<_nd;i++) {
|
||||||
|
assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0);
|
||||||
|
c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i];
|
||||||
|
}
|
||||||
|
std::vector<int> cbcoor(_nd); // coordinate of block in slot in canonical form
|
||||||
|
Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks);
|
||||||
|
|
||||||
|
// cpco, cbcoor
|
||||||
|
std::vector<int> clbcoor(_nd);
|
||||||
|
for (int i=0;i<_nd;i++) {
|
||||||
|
int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate
|
||||||
|
int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid
|
||||||
|
int tpcoor = _grid->_processor_coor[(i+1)%5];
|
||||||
|
if (pcoor != tpcoor)
|
||||||
|
return -1;
|
||||||
|
clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i
|
||||||
|
}
|
||||||
|
|
||||||
|
int lnb;
|
||||||
|
Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc);
|
||||||
|
//std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl;
|
||||||
|
return lnb;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
tests/lanczos/FieldBasisVector.h (new file, 81 lines):
namespace Grid {

template<class Field>
class BasisFieldVector {
 public:
  int _Nm;

  typedef typename Field::scalar_type Coeff_t;
  typedef typename Field::vector_type vCoeff_t;
  typedef typename Field::vector_object vobj;
  typedef typename vobj::scalar_object sobj;

  std::vector<Field> _v; // _Nfull vectors

  void report(int n,GridBase* value) {

    std::cout << GridLogMessage << "BasisFieldVector allocated:\n";
    std::cout << GridLogMessage << " Delta N = " << n << "\n";
    std::cout << GridLogMessage << " Size of full vectors (size) = " <<
      ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) << " GB\n";
    std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl;

    value->Barrier();

#ifdef __linux
    if (value->IsBoss()) {
      system("cat /proc/meminfo");
    }
#endif

    value->Barrier();

  }

  BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) {
    report(Nm,value);
  }

  ~BasisFieldVector() {
  }

  Field& operator[](int i) {
    return _v[i];
  }

  void orthogonalize(Field& w, int k) {
    basisOrthogonalize(_v,w,k);
  }

  void rotate(Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) {
    basisRotate(_v,Qt,j0,j1,k0,k1,Nm);
  }

  size_t size() const {
    return _Nm;
  }

  void resize(int n) {
    if (n > _Nm)
      _v.reserve(n);

    _v.resize(n,_v[0]._grid);

    if (n < _Nm)
      _v.shrink_to_fit();

    report(n - _Nm,_v[0]._grid);

    _Nm = n;
  }

  void sortInPlace(std::vector<RealD>& sort_vals, bool reverse) {
    basisSortInPlace(_v,sort_vals,reverse);
  }

  void deflate(const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
    basisDeflate(_v,eval,src_orig,result);
  }

};
}
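rotate above delegates to basisRotate, which is assumed here to form linear combinations v_j <- sum_k Qt(j,k) v_k over the requested index ranges (this is the operation the Lanczos restart needs). A dense standalone sketch of that operation, illustrative only and not Grid's implementation:

#include <iostream>
#include <vector>

// Rotate basis vectors in place: v_j <- sum_k Qt[j][k] * v_k for j in [j0,j1), k in [k0,k1).
void rotate_basis(std::vector<std::vector<double>>& v,
                  const std::vector<std::vector<double>>& Qt,
                  int j0, int j1, int k0, int k1) {
  size_t L = v[0].size();
  std::vector<std::vector<double>> out(j1 - j0, std::vector<double>(L, 0.0));
  for (int j = j0; j < j1; j++)
    for (int k = k0; k < k1; k++)
      for (size_t s = 0; s < L; s++)
        out[j - j0][s] += Qt[j][k] * v[k][s];
  for (int j = j0; j < j1; j++) v[j] = out[j - j0];
}

int main() {
  std::vector<std::vector<double>> v  = {{1, 0}, {0, 1}};
  std::vector<std::vector<double>> Qt = {{0, 1}, {1, 0}};  // swaps the two vectors
  rotate_basis(v, Qt, 0, 2, 0, 2);
  std::cout << v[0][1] << " " << v[1][0] << "\n";  // 1 1
}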
tests/lanczos/FieldVectorIO.h (new file, 1085 lines): diff not shown (file too large).
tests/lanczos/Makefile.am (new file, 1 line):

include Make.inc
tests/lanczos/Params.h (new file, 136 lines):
/*
  Params IO

  Author: Christoph Lehner
  Date: 2017
*/

#define PADD(p,X) p.get(#X,X);

class Params {
 protected:

  std::string trim(const std::string& sc) {
    std::string s = sc;
    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
                                    std::not1(std::ptr_fun<int, int>(std::isspace))));
    s.erase(std::find_if(s.rbegin(), s.rend(),
                         std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
    return s;
  }

 public:

  std::map< std::string, std::string > lines;
  std::string _fn;

  Params(const char* fn) : _fn(fn) {
    FILE* f = fopen(fn,"rt");
    assert(f);
    while (!feof(f)) {
      char buf[4096];
      if (fgets(buf,sizeof(buf),f)) {
        if (buf[0] != '#' && buf[0] != '\r' && buf[0] != '\n') {
          char* sep = strchr(buf,'=');
          assert(sep);
          *sep = '\0';
          lines[trim(buf)] = trim(sep+1);
        }
      }
    }
    fclose(f);
  }

  ~Params() {
  }

  std::string loghead() {
    return _fn + ": ";
  }

  bool has(const char* name) {
    auto f = lines.find(name);
    return (f != lines.end());
  }

  const std::string& get(const char* name) {
    auto f = lines.find(name);
    if (f == lines.end()) {
      std::cout << Grid::GridLogMessage << loghead() << "Could not find value for " << name << std::endl;
      abort();
    }
    return f->second;
  }

  void parse(std::string& s, const std::string& cval) {
    std::stringstream trimmer;
    trimmer << cval;
    s.clear();
    trimmer >> s;
  }

  void parse(int& i, const std::string& cval) {
    assert(sscanf(cval.c_str(),"%d",&i)==1);
  }

  void parse(long long& i, const std::string& cval) {
    assert(sscanf(cval.c_str(),"%lld",&i)==1);
  }

  void parse(double& f, const std::string& cval) {
    assert(sscanf(cval.c_str(),"%lf",&f)==1);
  }

  void parse(float& f, const std::string& cval) {
    assert(sscanf(cval.c_str(),"%f",&f)==1);
  }

  void parse(bool& b, const std::string& cval) {
    std::string lcval = cval;
    std::transform(lcval.begin(), lcval.end(), lcval.begin(), ::tolower);
    if (lcval == "true" || lcval == "yes") {
      b = true;
    } else if (lcval == "false" || lcval == "no") {
      b = false;
    } else {
      std::cout << "Invalid value for boolean: " << cval << std::endl;
      assert(0);
    }
  }

  void parse(std::complex<double>& f, const std::string& cval) {
    double r,i;
    assert(sscanf(cval.c_str(),"%lf %lf",&r,&i)==2);
    f = std::complex<double>(r,i);
  }

  void parse(std::complex<float>& f, const std::string& cval) {
    float r,i;
    assert(sscanf(cval.c_str(),"%f %f",&r,&i)==2);
    f = std::complex<float>(r,i);
  }

  template<class T>
  void get(const char* name, std::vector<T>& v) {
    int i = 0;
    v.resize(0);
    while (true) {
      char buf[4096];
      sprintf(buf,"%s[%d]",name,i++);
      if (!has(buf))
        break;
      T val;
      parse(val,get(buf));
      std::cout << Grid::GridLogMessage << loghead() << "Set " << buf << " to " << val << std::endl;
      v.push_back(val);
    }
  }

  template<class T>
  void get(const char* name, T& f) {
    parse(f,get(name));
    std::cout << Grid::GridLogMessage << loghead() << "Set " << name << " to " << f << std::endl;
  }

};
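Putting the parser together: a params.txt consumed by this class is a flat list of name = value lines, a '#' in the first column starts a comment line, complex numbers are given as two reals, and vectors are given element by element as name[i]. A hypothetical fragment with illustrative values, showing only a subset of the parameters the driver below actually reads:

# params.txt -- "name = value" per line; '#' lines are comments; vectors use name[i]
config = ckpoint_lat.1000
mass = 0.01
M5 = 1.8
Nstop1 = 60
Nk1 = 60
Np1 = 40
resid1 = 1e-8
cg_test_enabled = false
block_size[0] = 12
block_size[1] = 2
block_size[2] = 2
block_size[3] = 2
block_size[4] = 2
omega[0] = 1.0 0.0
omega[1] = 0.5 0.1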
tests/lanczos/Test_dwf_compressed_lanczos.cc (new file, 712 lines):
/*
  Authors: Christoph Lehner
  Date: 2017

  Multigrid Lanczos

  TODO:

  High priority:
  - Explore filtering of the starting vector again; it should really work: if the Chebyshev takes the
    value 4 in the low-mode region and 1 in the high-mode region, applying it 15 times gives roughly a
    1e9 suppression of the high modes (4^15 ~ 1e9), which should already create the desired invariant
    subspace. Missing something here? Maybe the dynamic range is dangerous, i.e., it could also kill the
    interesting eigenrange if one is not careful.

    Better: use all Chebyshev polynomials up to order N to approximate a step function; try this!
    Problem: the width of the step. One can kill the eigenspace above 1e-3 while keeping the part below
    1e-5 equal to 1.

  Low priority:
  - Given that I seem to need many restarts and a high-degree polynomial to create the basis, and this
    takes about one day, seriously consider a simple method to create a basis (ortho Krylov, low-order
    polynomial) and then fix up the lowest, say, 200 eigenvalues with one run using a high-degree
    polynomial (order 600 could be enough).
*/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
/////////////////////////////////////////////////////////////////////////////
// The following are now decoupled from the Lanczos and deal with grids.
// Safe to replace functionality
/////////////////////////////////////////////////////////////////////////////
#include "BlockedGrid.h"
#include "FieldBasisVector.h"
#include "BlockProjector.h"
#include "FieldVectorIO.h"
#include "Params.h"

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

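As a concrete handle on the Chebyshev filtering discussed in the TODO above: the filter is just a Chebyshev polynomial of the operator mapped to an interval, bounded by 1 on the interval to be suppressed and growing rapidly below it. A minimal scalar sketch with an illustrative interval and order (this is not Grid's Chebyshev class):

#include <iostream>

// Evaluate the Chebyshev polynomial T_order mapped to the interval [lo, hi].
// Inside [lo,hi] the value stays bounded by 1; below lo it grows quickly, which is
// why a Chebyshev of the squared operator amplifies low modes relative to the bulk.
double cheby(double x, double lo, double hi, int order) {
  double y = (2.0 * x - hi - lo) / (hi - lo);   // map [lo,hi] -> [-1,1]
  if (order == 0) return 1.0;
  double Tm2 = 1.0, Tm1 = y, T = y;
  for (int n = 2; n <= order; n++) {
    T = 2.0 * y * Tm1 - Tm2;                    // three-term recurrence
    Tm2 = Tm1; Tm1 = T;
  }
  return T;
}

int main() {
  // Illustrative numbers: suppress [0.05, 60], amplify eigenvalues below 0.05.
  std::cout << "p(0.01) = " << cheby(0.01, 0.05, 60.0, 100) << "\n";  // well above 1
  std::cout << "p(1.0)  = " << cheby(1.0 , 0.05, 60.0, 100) << "\n";  // bounded by 1
}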
bool read_evals(GridBase* _grid, char* fn, std::vector<RealD>& evals) {
|
||||||
|
|
||||||
|
FILE* f = 0;
|
||||||
|
uint32_t status = 0;
|
||||||
|
if (_grid->IsBoss()) {
|
||||||
|
f = fopen(fn,"rt");
|
||||||
|
status = f ? 1 : 0;
|
||||||
|
}
|
||||||
|
_grid->GlobalSum(status);
|
||||||
|
|
||||||
|
if (!status)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
uint32_t N;
|
||||||
|
if (f)
|
||||||
|
assert(fscanf(f,"%d\n",&N)==1);
|
||||||
|
else
|
||||||
|
N = 0;
|
||||||
|
_grid->GlobalSum(N);
|
||||||
|
|
||||||
|
std::cout << "Reading " << N << " eigenvalues" << std::endl;
|
||||||
|
|
||||||
|
evals.resize(N);
|
||||||
|
|
||||||
|
for (int i=0;i<N;i++) {
|
||||||
|
if (f)
|
||||||
|
assert(fscanf(f,"%lf",&evals[i])==1);
|
||||||
|
else
|
||||||
|
evals[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
_grid->GlobalSumVector(&evals[0],evals.size());
|
||||||
|
|
||||||
|
if (f)
|
||||||
|
fclose(f);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void write_evals(char* fn, std::vector<RealD>& evals) {
|
||||||
|
FILE* f = fopen(fn,"wt");
|
||||||
|
assert(f);
|
||||||
|
|
||||||
|
int N = (int)evals.size();
|
||||||
|
fprintf(f,"%d\n",N);
|
||||||
|
|
||||||
|
for (int i=0;i<N;i++) {
|
||||||
|
fprintf(f,"%.15E\n",evals[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(f);
|
||||||
|
}
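For reference, write_evals produces (and read_evals expects) a plain text file with the eigenvalue count on the first line and one value per line in %.15E format; an illustrative example with made-up values:

3
1.234567890123450E-05
2.345678901234560E-04
3.456789012345670E-03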
|
||||||
|
|
||||||
|
void write_history(char* fn, std::vector<RealD>& hist) {
|
||||||
|
FILE* f = fopen(fn,"wt");
|
||||||
|
assert(f);
|
||||||
|
|
||||||
|
int N = (int)hist.size();
|
||||||
|
for (int i=0;i<N;i++) {
|
||||||
|
fprintf(f,"%d %.15E\n",i,hist[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename Field>
|
||||||
|
class CheckpointedLinearFunction : public LinearFunction<Field> {
|
||||||
|
public:
|
||||||
|
LinearFunction<Field>& _op;
|
||||||
|
std::string _dir;
|
||||||
|
int _max_apply;
|
||||||
|
int _apply, _apply_actual;
|
||||||
|
GridBase* _grid;
|
||||||
|
FILE* _f;
|
||||||
|
|
||||||
|
CheckpointedLinearFunction(GridBase* grid, LinearFunction<Field>& op, const char* dir,int max_apply) : _op(op), _dir(dir), _grid(grid), _f(0),
|
||||||
|
_max_apply(max_apply), _apply(0), _apply_actual(0) {
|
||||||
|
|
||||||
|
FieldVectorIO::conditionalMkDir(dir);
|
||||||
|
|
||||||
|
char fn[4096];
|
||||||
|
sprintf(fn,"%s/ckpt_op.%4.4d",_dir.c_str(),_grid->ThisRank());
|
||||||
|
printf("CheckpointLinearFunction:: file %s\n",fn);
|
||||||
|
_f = fopen(fn,"r+b");
|
||||||
|
if (!_f)
|
||||||
|
_f = fopen(fn,"w+b");
|
||||||
|
assert(_f);
|
||||||
|
fseek(_f,0,SEEK_CUR);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
~CheckpointedLinearFunction() {
|
||||||
|
if (_f) {
|
||||||
|
fclose(_f);
|
||||||
|
_f = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_ckpt(const Field& in, Field& out) {
|
||||||
|
|
||||||
|
off_t cur = ftello(_f);
|
||||||
|
fseeko(_f,0,SEEK_END);
|
||||||
|
if (cur == ftello(_f))
|
||||||
|
return false;
|
||||||
|
fseeko(_f,cur,SEEK_SET);
|
||||||
|
|
||||||
|
size_t sz = sizeof(out._odata[0]) * out._odata.size();
|
||||||
|
|
||||||
|
GridStopWatch gsw;
|
||||||
|
gsw.Start();
|
||||||
|
uint32_t crc_exp;
|
||||||
|
assert(fread(&crc_exp,4,1,_f)==1);
|
||||||
|
assert(fread(&out._odata[0],sz,1,_f)==1);
|
||||||
|
assert(FieldVectorIO::crc32_threaded((unsigned char*)&out._odata[0],sz,0x0)==crc_exp);
|
||||||
|
gsw.Stop();
|
||||||
|
|
||||||
|
printf("CheckpointLinearFunction:: reading %lld\n",(long long)sz);
|
||||||
|
std::cout << GridLogMessage << "Loading " << ((RealD)sz/1024./1024./1024.) << " GB in " << gsw.Elapsed() << std::endl;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void save_ckpt(const Field& in, Field& out) {
|
||||||
|
|
||||||
|
fseek(_f,0,SEEK_CUR); // switch to write
|
||||||
|
|
||||||
|
size_t sz = sizeof(out._odata[0]) * out._odata.size();
|
||||||
|
|
||||||
|
GridStopWatch gsw;
|
||||||
|
gsw.Start();
|
||||||
|
uint32_t crc = FieldVectorIO::crc32_threaded((unsigned char*)&out._odata[0],sz,0x0);
|
||||||
|
assert(fwrite(&crc,4,1,_f)==1);
|
||||||
|
assert(fwrite(&out._odata[0],sz,1,_f)==1);
|
||||||
|
fflush(_f); // try this on the GPFS to suppress OPA usage for disk during dslash; this is not needed at Lustre/JLAB
|
||||||
|
gsw.Stop();
|
||||||
|
|
||||||
|
printf("CheckpointLinearFunction:: writing %lld\n",(long long)sz);
|
||||||
|
std::cout << GridLogMessage << "Saving " << ((RealD)sz/1024./1024./1024.) << " GB in " << gsw.Elapsed() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator()(const Field& in, Field& out) {
|
||||||
|
|
||||||
|
_apply++;
|
||||||
|
|
||||||
|
if (load_ckpt(in,out))
|
||||||
|
return;
|
||||||
|
|
||||||
|
_op(in,out);
|
||||||
|
|
||||||
|
save_ckpt(in,out);
|
||||||
|
|
||||||
|
if (_apply_actual++ >= _max_apply) {
|
||||||
|
std::cout << GridLogMessage << "Maximum application of operator reached, checkpoint and finish in future job" << std::endl;
|
||||||
|
if (_f) { fclose(_f); _f=0; }
|
||||||
|
in._grid->Barrier();
|
||||||
|
Grid_finalize();
|
||||||
|
exit(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
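The class above memoises operator applications on disk: each result is appended as a checksum followed by the raw payload, and a restarted job replays the file instead of recomputing. A toy standalone sketch of that pattern; crc32_of is a stand-in and not the CRC used by FieldVectorIO:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy checksum, for illustration only.
uint32_t crc32_of(const unsigned char* p, size_t n) {
  uint32_t c = 0; for (size_t i = 0; i < n; i++) c = (c << 1) ^ p[i]; return c;
}

int main() {
  std::vector<double> result = {1.0, 2.0, 3.0};          // pretend this was expensive to compute
  size_t sz = result.size() * sizeof(double);

  FILE* f = fopen("ckpt.bin", "w+b");
  uint32_t crc = crc32_of((unsigned char*)result.data(), sz);
  fwrite(&crc, 4, 1, f);                                  // checksum first
  fwrite(result.data(), sz, 1, f);                        // then the payload
  fflush(f);

  fseek(f, 0, SEEK_SET);                                  // "restart": replay instead of recompute
  uint32_t crc_exp; std::vector<double> replay(3);
  assert(fread(&crc_exp, 4, 1, f) == 1);
  assert(fread(replay.data(), sz, 1, f) == 1);
  assert(crc32_of((unsigned char*)replay.data(), sz) == crc_exp);
  fclose(f);
  return 0;
}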
|
||||||
|
|
||||||
|
template<typename CoarseField,typename Field>
|
||||||
|
class ProjectedFunctionHermOp : public LinearFunction<CoarseField> {
|
||||||
|
public:
|
||||||
|
OperatorFunction<Field> & _poly;
|
||||||
|
LinearOperatorBase<Field> &_Linop;
|
||||||
|
BlockProjector<Field>& _pr;
|
||||||
|
|
||||||
|
ProjectedFunctionHermOp(BlockProjector<Field>& pr,OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop) : _poly(poly), _Linop(linop), _pr(pr) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator()(const CoarseField& in, CoarseField& out) {
|
||||||
|
assert(_pr._bgrid._o_blocks == in._grid->oSites());
|
||||||
|
|
||||||
|
Field fin(_pr._bgrid._grid);
|
||||||
|
Field fout(_pr._bgrid._grid);
|
||||||
|
|
||||||
|
GridStopWatch gsw1,gsw2,gsw3;
|
||||||
|
// fill fin
|
||||||
|
gsw1.Start();
|
||||||
|
_pr.coarseToFine(in,fin);
|
||||||
|
gsw1.Stop();
|
||||||
|
|
||||||
|
// apply poly
|
||||||
|
gsw2.Start();
|
||||||
|
_poly(_Linop,fin,fout);
|
||||||
|
gsw2.Stop();
|
||||||
|
|
||||||
|
// fill out
|
||||||
|
gsw3.Start();
|
||||||
|
_pr.fineToCoarse(fout,out);
|
||||||
|
gsw3.Stop();
|
||||||
|
|
||||||
|
auto eps = innerProduct(in,out);
|
||||||
|
std::cout << GridLogMessage << "Operator timing details: c2f = " << gsw1.Elapsed() << " poly = " << gsw2.Elapsed() << " f2c = " << gsw3.Elapsed() <<
|
||||||
|
" Complimentary Hermiticity check: " << eps.imag() / std::abs(eps) << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
};
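In operator form, what ProjectedFunctionHermOp applies is the fine-grid polynomial of the operator sandwiched between the block projector's maps. Writing the coarse-to-fine prolongation built from the block basis as P (coarseToFine) and the fine-to-coarse restriction as P^\dagger (fineToCoarse), a sketch in our notation, not taken from the source:

\hat{A}_{\mathrm{coarse}} \;=\; P^{\dagger}\, p(A)\, P ,
\qquad p = \text{the Chebyshev polynomial Cheb2}, \quad A = \text{HermOp on the fine grid}.

ProjectedHermOp below is the same construction with p the identity.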
|
||||||
|
|
||||||
|
template<typename CoarseField,typename Field>
|
||||||
|
class ProjectedHermOp : public LinearFunction<CoarseField> {
|
||||||
|
public:
|
||||||
|
LinearOperatorBase<Field> &_Linop;
|
||||||
|
BlockProjector<Field>& _pr;
|
||||||
|
|
||||||
|
ProjectedHermOp(BlockProjector<Field>& pr,LinearOperatorBase<Field>& linop) : _Linop(linop), _pr(pr) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator()(const CoarseField& in, CoarseField& out) {
|
||||||
|
assert(_pr._bgrid._o_blocks == in._grid->oSites());
|
||||||
|
Field fin(_pr._bgrid._grid);
|
||||||
|
Field fout(_pr._bgrid._grid);
|
||||||
|
_pr.coarseToFine(in,fin);
|
||||||
|
_Linop.HermOp(fin,fout);
|
||||||
|
_pr.fineToCoarse(fout,out);
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename vtype, int N > using CoarseSiteFieldGeneral = iScalar< iVector<vtype, N> >;
|
||||||
|
template<int N> using CoarseSiteFieldD = CoarseSiteFieldGeneral< vComplexD, N >;
|
||||||
|
template<int N> using CoarseSiteFieldF = CoarseSiteFieldGeneral< vComplexF, N >;
|
||||||
|
template<int N> using CoarseSiteField = CoarseSiteFieldGeneral< vComplex, N >;
|
||||||
|
template<int N> using CoarseLatticeFermion = Lattice< CoarseSiteField<N> >;
|
||||||
|
template<int N> using CoarseLatticeFermionD = Lattice< CoarseSiteFieldD<N> >;
|
||||||
|
|
||||||
|
template<typename Field,int Nstop1>
|
||||||
|
void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npoly2,
|
||||||
|
int Nstop2,int Nk2,int Nm2,RealD resid2,RealD betastp2,int MaxIt,int MinRes2,
|
||||||
|
LinearOperatorBase<Field>& HermOp, std::vector<RealD>& eval1, bool cg_test_enabled,
|
||||||
|
int cg_test_maxiter,int nsingle,int SkipTest2, int MaxApply2,bool smoothed_eval_enabled,
|
||||||
|
int smoothed_eval_inner,int smoothed_eval_outer,int smoothed_eval_begin,
|
||||||
|
int smoothed_eval_end,RealD smoothed_eval_inner_resid) {
|
||||||
|
|
||||||
|
BlockedGrid<Field>& bgrid = pr._bgrid;
|
||||||
|
BasisFieldVector<Field>& basis = pr._evec;
|
||||||
|
|
||||||
|
|
||||||
|
std::vector<int> coarseFourDimLatt;
|
||||||
|
for (int i=0;i<4;i++)
|
||||||
|
coarseFourDimLatt.push_back(bgrid._nb[1+i] * bgrid._grid->_processors[1+i]);
|
||||||
|
assert(bgrid._grid->_processors[0] == 1);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "CoarseGrid = " << coarseFourDimLatt << " with basis = " << Nstop1 << std::endl;
|
||||||
|
GridCartesian * UCoarseGrid = SpaceTimeGrid::makeFourDimGrid(coarseFourDimLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridCartesian * FCoarseGrid = SpaceTimeGrid::makeFiveDimGrid(bgrid._nb[0],UCoarseGrid);
|
||||||
|
|
||||||
|
Chebyshev<Field> Cheb2(alpha2,beta,Npoly2);
|
||||||
|
CoarseLatticeFermion<Nstop1> src_coarse(FCoarseGrid);
|
||||||
|
|
||||||
|
// Second round of Lanczos in blocked space
|
||||||
|
std::vector<RealD> eval2(Nm2);
|
||||||
|
std::vector<RealD> eval3(Nm2);
|
||||||
|
BasisFieldVector<CoarseLatticeFermion<Nstop1> > coef(Nm2,FCoarseGrid);
|
||||||
|
|
||||||
|
ProjectedFunctionHermOp<CoarseLatticeFermion<Nstop1>,LatticeFermion> Op2plain(pr,Cheb2,HermOp);
|
||||||
|
CheckpointedLinearFunction<CoarseLatticeFermion<Nstop1> > Op2ckpt(src_coarse._grid,Op2plain,"checkpoint",MaxApply2);
|
||||||
|
LinearFunction< CoarseLatticeFermion<Nstop1> >* Op2;
|
||||||
|
if (MaxApply2) {
|
||||||
|
Op2 = &Op2ckpt;
|
||||||
|
} else {
|
||||||
|
Op2 = &Op2plain;
|
||||||
|
}
|
||||||
|
ProjectedHermOp<CoarseLatticeFermion<Nstop1>,LatticeFermion> Op2nopoly(pr,HermOp);
|
||||||
|
ImplicitlyRestartedLanczos<CoarseLatticeFermion<Nstop1> > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,MaxIt,betastp2,MinRes2);
|
||||||
|
|
||||||
|
|
||||||
|
src_coarse = 1.0;
|
||||||
|
|
||||||
|
// Precision test
|
||||||
|
{
|
||||||
|
Field tmp(bgrid._grid);
|
||||||
|
CoarseLatticeFermion<Nstop1> tmp2(FCoarseGrid);
|
||||||
|
CoarseLatticeFermion<Nstop1> tmp3(FCoarseGrid);
|
||||||
|
tmp2 = 1.0;
|
||||||
|
tmp3 = 1.0;
|
||||||
|
|
||||||
|
pr.coarseToFine(tmp2,tmp);
|
||||||
|
pr.fineToCoarse(tmp,tmp2);
|
||||||
|
|
||||||
|
tmp2 -= tmp3;
|
||||||
|
std::cout << GridLogMessage << "Precision Test c->f->c: " << norm2(tmp2) / norm2(tmp3) << std::endl;
|
||||||
|
|
||||||
|
//bgrid._grid->Barrier();
|
||||||
|
//return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
if (!FieldVectorIO::read_compressed_vectors("lanczos.output",pr,coef) ||
|
||||||
|
!read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt",eval3) ||
|
||||||
|
!read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.linear",eval1) ||
|
||||||
|
!read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.poly",eval2)
|
||||||
|
) {
|
||||||
|
|
||||||
|
|
||||||
|
IRL2.calc(eval2,coef._v,src_coarse,Nconv,true);
|
||||||
|
|
||||||
|
coef.resize(Nstop2);
|
||||||
|
eval2.resize(Nstop2);
|
||||||
|
eval3.resize(Nstop2);
|
||||||
|
|
||||||
|
std::vector<Field> step3_cache;
|
||||||
|
|
||||||
|
// reconstruct eigenvalues of original operator
|
||||||
|
for (int i=0;i<Nstop2;i++){
|
||||||
|
RealD eval2_linear;
|
||||||
|
|
||||||
|
if (i<Nstop1) {
|
||||||
|
eval2_linear = eval1[i];
|
||||||
|
} else {
|
||||||
|
eval2_linear = eval2[i-1];
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD eval2_poly = eval2[i];
|
||||||
|
RealD eval_reconstruct = Cheb2.approxInv(eval2_poly,eval2_linear,100,1e-10);
|
||||||
|
std::cout << i << " Reconstructed eval = " << eval_reconstruct << " from guess " << eval2_linear << std::endl;
|
||||||
|
eval2[i] = eval_reconstruct;
|
||||||
|
}
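Since the coarse Lanczos above diagonalises p(A) rather than A, it returns eigenvalues of the polynomial; the loop just shown recovers the eigenvalue of A by numerically inverting the polynomial, seeded with the linear (unfiltered) estimate. Assuming Cheb2.approxInv performs exactly that inversion to the stated tolerance, in our notation:

\mu_i = p(\lambda_i) \;\Longrightarrow\; \lambda_i \approx p^{-1}(\mu_i),
\qquad \lambda_i^{(0)} = \lambda_i^{\mathrm{linear}} \ \text{(starting guess)} .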
|
||||||
|
|
||||||
|
// as demonstrated in CG test below, best result from mixed determination
|
||||||
|
for (int i=0;i<Nstop2;i++)
|
||||||
|
eval3[i] = (i < Nstop1) ? eval1[i] : eval2[i];
|
||||||
|
|
||||||
|
for(int i=0;i<Nstop2;i++){
|
||||||
|
std::cout << i<<" / "<< Nstop2<< " eigenvalue "<< eval3[i] <<std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
// write
|
||||||
|
mkdir("lanczos.output",ACCESSPERMS);
|
||||||
|
FieldVectorIO::write_compressed_vectors("lanczos.output",pr,coef,nsingle);
|
||||||
|
if (bgrid._grid->IsBoss()) {
|
||||||
|
write_evals((char *)"lanczos.output/eigen-values.txt",eval3);
|
||||||
|
write_evals((char *)"lanczos.output/eigen-values.txt.linear",eval1);
|
||||||
|
write_evals((char *)"lanczos.output/eigen-values.txt.poly",eval2);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// fix up eigenvalues
|
||||||
|
if (!read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.smoothed",eval3) && smoothed_eval_enabled) {
|
||||||
|
|
||||||
|
ConjugateGradient<LatticeFermion> CG(smoothed_eval_inner_resid, smoothed_eval_inner, false);
|
||||||
|
|
||||||
|
LatticeFermion v_i(basis[0]._grid);
|
||||||
|
auto tmp = v_i;
|
||||||
|
auto tmp2 = v_i;
|
||||||
|
|
||||||
|
for (int i=smoothed_eval_begin;i<smoothed_eval_end;i++) {
|
||||||
|
|
||||||
|
GridStopWatch gsw;
|
||||||
|
|
||||||
|
gsw.Start();
|
||||||
|
|
||||||
|
pr.coarseToFine(coef[i],v_i);
|
||||||
|
v_i.checkerboard = Odd;
|
||||||
|
|
||||||
|
for (int j=0;j<smoothed_eval_outer;j++) {
|
||||||
|
tmp=zero;
|
||||||
|
//pr.deflate(coef,eval3,Nstop2,v_i,tmp);
|
||||||
|
CG(HermOp, v_i, tmp);
|
||||||
|
|
||||||
|
v_i = 1.0 / ::sqrt( norm2(tmp) ) * tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp = v_i;
|
||||||
|
|
||||||
|
HermOp.HermOp(tmp,tmp2);
|
||||||
|
|
||||||
|
RealD ev = innerProduct(tmp,tmp2).real();
|
||||||
|
|
||||||
|
gsw.Stop();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Smoothed eigenvalue " << i << " from " << eval3[i] << " to " << ev << " in " << gsw.Elapsed() << std::endl;
|
||||||
|
// " with effective smoother precision " << (CG.ResHistory.back() / CG.ResHistory.front() ) << std::endl;
|
||||||
|
// CG.ResHistory.clear();
|
||||||
|
|
||||||
|
eval3[i] = ev;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bgrid._grid->IsBoss()) {
|
||||||
|
write_evals((char *)"lanczos.output/eigen-values.txt.smoothed",eval3);
|
||||||
|
write_evals((char *)"lanczos.output/eigen-values.txt",eval3); // also reset this to the best ones we have available
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do CG test with and without deflation
|
||||||
|
if (cg_test_enabled) {
|
||||||
|
ConjugateGradient<LatticeFermion> CG(1.0e-8, cg_test_maxiter, false);
|
||||||
|
LatticeFermion src_orig(bgrid._grid);
|
||||||
|
src_orig.checkerboard = Odd;
|
||||||
|
src_orig = 1.0;
|
||||||
|
src_orig = src_orig * (1.0 / ::sqrt(norm2(src_orig)) );
|
||||||
|
auto result = src_orig;
|
||||||
|
|
||||||
|
// undeflated solve
|
||||||
|
std::cout << GridLogMessage << " Undeflated solve "<<std::endl;
|
||||||
|
result = zero;
|
||||||
|
CG(HermOp, src_orig, result);
|
||||||
|
// if (UCoarseGrid->IsBoss())
|
||||||
|
// write_history("cg_test.undefl",CG.ResHistory);
|
||||||
|
// CG.ResHistory.clear();
|
||||||
|
|
||||||
|
// deflated solve with all eigenvectors
|
||||||
|
std::cout << GridLogMessage << " Deflated solve with all evectors"<<std::endl;
|
||||||
|
result = zero;
|
||||||
|
pr.deflate(coef,eval2,Nstop2,src_orig,result);
|
||||||
|
CG(HermOp, src_orig, result);
|
||||||
|
// if (UCoarseGrid->IsBoss())
|
||||||
|
// write_history("cg_test.defl_all",CG.ResHistory);
|
||||||
|
// CG.ResHistory.clear();
|
||||||
|
|
||||||
|
// deflated solve with non-blocked eigenvectors
|
||||||
|
std::cout << GridLogMessage << " Deflated solve with non-blocked evectors"<<std::endl;
|
||||||
|
result = zero;
|
||||||
|
pr.deflate(coef,eval1,Nstop1,src_orig,result);
|
||||||
|
CG(HermOp, src_orig, result);
|
||||||
|
// if (UCoarseGrid->IsBoss())
|
||||||
|
// write_history("cg_test.defl_full",CG.ResHistory);
|
||||||
|
// CG.ResHistory.clear();
|
||||||
|
|
||||||
|
// deflated solve with all eigenvectors and original eigenvalues from proj
|
||||||
|
std::cout << GridLogMessage << " Deflated solve with all eigenvectors and original eigenvalues from proj"<<std::endl;
|
||||||
|
result = zero;
|
||||||
|
pr.deflate(coef,eval3,Nstop2,src_orig,result);
|
||||||
|
CG(HermOp, src_orig, result);
|
||||||
|
// if (UCoarseGrid->IsBoss())
|
||||||
|
// write_history("cg_test.defl_all_ev3",CG.ResHistory);
|
||||||
|
// CG.ResHistory.clear();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename Field>
|
||||||
|
void quick_krylov_basis(BasisFieldVector<Field>& evec,Field& src,LinearFunction<Field>& Op,int Nstop) {
|
||||||
|
Field tmp = src;
|
||||||
|
Field tmp2 = tmp;
|
||||||
|
|
||||||
|
for (int i=0;i<Nstop;i++) {
|
||||||
|
GridStopWatch gsw;
|
||||||
|
gsw.Start();
|
||||||
|
Op(tmp,tmp2);
|
||||||
|
gsw.Stop();
|
||||||
|
evec.orthogonalize(tmp2,i);
|
||||||
|
|
||||||
|
RealD nn = norm2(tmp2);
|
||||||
|
nn = Grid::sqrt(nn);
|
||||||
|
tmp2 = tmp2 * (1.0/nn);
|
||||||
|
|
||||||
|
evec[i] = tmp2;
|
||||||
|
tmp = tmp2;
|
||||||
|
std::cout << GridLogMessage << "Quick_krylov_basis: " << i << "/" << Nstop << " timing of operator=" << gsw.Elapsed() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
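quick_krylov_basis above is essentially a power iteration with on-the-fly orthogonalisation: apply the operator, project out the directions already found, normalise, store. A dense standalone analogue with a 2x2 toy operator, illustrative only:

#include <cmath>
#include <iostream>
#include <vector>

using Vec = std::vector<double>;

double dot(const Vec& a, const Vec& b) {
  double s = 0; for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i]; return s;
}

int main() {
  auto Op = [](const Vec& x) { Vec y(2); y[0] = 2*x[0] + x[1]; y[1] = x[0] + 3*x[1]; return y; };
  Vec v = {1.0, 1.0};
  std::vector<Vec> basis;
  for (int i = 0; i < 2; i++) {
    Vec w = Op(v);
    for (const Vec& b : basis) {                 // orthogonalise against earlier vectors
      double c = dot(b, w);
      for (size_t s = 0; s < w.size(); s++) w[s] -= c * b[s];
    }
    double n = std::sqrt(dot(w, w));
    for (double& x : w) x /= n;                  // normalise
    basis.push_back(w);
    v = w;                                       // next Krylov direction
  }
  std::cout << "built " << basis.size() << " orthonormal vectors\n";
}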
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main (int argc, char ** argv) {
|
||||||
|
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
const int MaxIt = 10000;
|
||||||
|
|
||||||
|
int Ls;
|
||||||
|
RealD mass;
|
||||||
|
RealD M5;
|
||||||
|
std::vector < std::complex<double> > omega;
|
||||||
|
|
||||||
|
RealD alpha1, alpha2, beta;
|
||||||
|
int Npoly1, Npoly2;
|
||||||
|
int Nstop1, Nstop2;
|
||||||
|
int Nk1, Nk2;
|
||||||
|
int Np1, Np2;
|
||||||
|
int MinRes1, MinRes2;
|
||||||
|
int SkipTest2, MaxApply2;
|
||||||
|
bool checkpoint_basis;
|
||||||
|
bool cg_test_enabled;
|
||||||
|
bool exit_after_basis_calculation;
|
||||||
|
bool simple_krylov_basis;
|
||||||
|
int cg_test_maxiter;
|
||||||
|
int nsingle; // store in single precision, the rest in FP16
|
||||||
|
int max_cheb_time_ms;
|
||||||
|
bool smoothed_eval_enabled;
|
||||||
|
int smoothed_eval_inner;
|
||||||
|
int smoothed_eval_outer;
|
||||||
|
int smoothed_eval_begin;
|
||||||
|
int smoothed_eval_end;
|
||||||
|
RealD smoothed_eval_inner_resid;
|
||||||
|
|
||||||
|
// vector representation
|
||||||
|
std::vector<int> block_size; // 5d block size
|
||||||
|
|
||||||
|
RealD resid1, resid2, betastp1, betastp2, basis_norm_threshold;
|
||||||
|
|
||||||
|
std::string config;
|
||||||
|
|
||||||
|
Params jp("params.txt");
|
||||||
|
PADD(jp,Npoly1); PADD(jp,Npoly2);
|
||||||
|
PADD(jp,max_cheb_time_ms);
|
||||||
|
PADD(jp,Nstop1); PADD(jp,Nstop2); PADD(jp,MaxApply2);
|
||||||
|
PADD(jp,Nk1); PADD(jp,Nk2); PADD(jp,betastp1); PADD(jp,betastp2);
|
||||||
|
PADD(jp,Np1); PADD(jp,Np2); basis_norm_threshold = 1e-5; //PADD(jp,basis_norm_threshold);
|
||||||
|
PADD(jp,block_size); PADD(jp,smoothed_eval_enabled); PADD(jp,smoothed_eval_inner);
|
||||||
|
PADD(jp,resid1); PADD(jp,resid2); PADD(jp,smoothed_eval_outer);
|
||||||
|
PADD(jp,alpha1); PADD(jp,alpha2); PADD(jp,smoothed_eval_begin);
|
||||||
|
PADD(jp,MinRes1); PADD(jp,MinRes2); PADD(jp,smoothed_eval_end);
|
||||||
|
PADD(jp,beta); PADD(jp,mass); PADD(jp,smoothed_eval_inner_resid);
|
||||||
|
PADD(jp,omega); PADD(jp,config);
|
||||||
|
PADD(jp,M5); PADD(jp,cg_test_enabled);
|
||||||
|
PADD(jp,cg_test_maxiter); PADD(jp,checkpoint_basis);
|
||||||
|
PADD(jp,nsingle); PADD(jp,exit_after_basis_calculation);
|
||||||
|
PADD(jp,simple_krylov_basis); PADD(jp,SkipTest2);
|
||||||
|
|
||||||
|
Ls = (int)omega.size();
|
||||||
|
|
||||||
|
// Grids
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridCartesian * UGridHP = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridRedBlackCartesian * UrbGridHP = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridHP);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridCartesian * FGridHP = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridHP);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGridHP = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridHP);
|
||||||
|
|
||||||
|
// Gauge field
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
FieldMetaData header;
|
||||||
|
NerscIO::readConfiguration(Umu,header,config);
|
||||||
|
std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt()
|
||||||
|
<< " Ls: " << Ls << std::endl;
|
||||||
|
|
||||||
|
// ZMobius EO Operator
|
||||||
|
ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omega,1.,0.);
|
||||||
|
SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
|
||||||
|
|
||||||
|
// Eigenvector storage
|
||||||
|
const int Nm1 = Np1 + Nk1;
|
||||||
|
const int Nm2 = Np2 + Nk2; // maximum number of vectors we need to keep
|
||||||
|
std::cout << GridLogMessage << "Keep " << Nm1 << " full vectors" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Keep " << Nm2 << " total vectors" << std::endl;
|
||||||
|
assert(Nm2 >= Nm1);
|
||||||
|
BasisFieldVector<LatticeFermion> evec(Nm1,FrbGrid); // start off with keeping full vectors
|
||||||
|
|
||||||
|
// First and second cheby
|
||||||
|
Chebyshev<LatticeFermion> Cheb1(alpha1,beta,Npoly1);
|
||||||
|
FunctionHermOp<LatticeFermion> Op1(Cheb1,HermOp);
|
||||||
|
PlainHermOp<LatticeFermion> Op1test(HermOp);
|
||||||
|
|
||||||
|
// Eigenvalue storage
|
||||||
|
std::vector<RealD> eval1(evec.size());
|
||||||
|
|
||||||
|
// Construct source vector
|
||||||
|
LatticeFermion src(FrbGrid);
|
||||||
|
{
|
||||||
|
src=1.0;
|
||||||
|
src.checkerboard = Odd;
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
RealD nn = norm2(src);
|
||||||
|
nn = Grid::sqrt(nn);
|
||||||
|
src = src * (1.0/nn);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do a benchmark and a quick exit if performance is too low (ugly but needed due to performance fluctuations)
|
||||||
|
if (max_cheb_time_ms) {
|
||||||
|
// one round of warmup
|
||||||
|
auto tmp = src;
|
||||||
|
GridStopWatch gsw1,gsw2;
|
||||||
|
gsw1.Start();
|
||||||
|
Cheb1(HermOp,src,tmp);
|
||||||
|
gsw1.Stop();
|
||||||
|
Ddwf.ZeroCounters();
|
||||||
|
gsw2.Start();
|
||||||
|
Cheb1(HermOp,src,tmp);
|
||||||
|
gsw2.Stop();
|
||||||
|
Ddwf.Report();
|
||||||
|
std::cout << GridLogMessage << "Performance check; warmup = " << gsw1.Elapsed() << " test = " << gsw2.Elapsed() << std::endl;
|
||||||
|
int ms = (int)(gsw2.useconds()/1e3);
|
||||||
|
if (ms > max_cheb_time_ms) {
|
||||||
|
std::cout << GridLogMessage << "Performance too poor: " << ms << " ms, cutoff = " << max_cheb_time_ms << " ms" << std::endl;
|
||||||
|
Grid_finalize();
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// First round of Lanczos to get low mode basis
|
||||||
|
ImplicitlyRestartedLanczos<LatticeFermion> IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,MaxIt,betastp1,MinRes1);
|
||||||
|
int Nconv;
|
||||||
|
|
||||||
|
char tag[1024];
|
||||||
|
if (!FieldVectorIO::read_argonne(evec,(char *)"checkpoint") || !read_evals(UGrid,(char *)"checkpoint/eigen-values.txt",eval1)) {
|
||||||
|
|
||||||
|
if (simple_krylov_basis) {
|
||||||
|
quick_krylov_basis(evec,src,Op1,Nstop1);
|
||||||
|
} else {
|
||||||
|
IRL1.calc(eval1,evec._v,src,Nconv,false);
|
||||||
|
}
|
||||||
|
evec.resize(Nstop1); // and throw away superfluous
|
||||||
|
eval1.resize(Nstop1);
|
||||||
|
if (checkpoint_basis)
|
||||||
|
FieldVectorIO::write_argonne(evec,(char *)"checkpoint");
|
||||||
|
if (UGrid->IsBoss() && checkpoint_basis)
|
||||||
|
write_evals((char *)"checkpoint/eigen-values.txt",eval1);
|
||||||
|
|
||||||
|
Ddwf.Report();
|
||||||
|
|
||||||
|
if (exit_after_basis_calculation) {
|
||||||
|
Grid_finalize();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now test eigenvectors
|
||||||
|
if (!simple_krylov_basis) {
|
||||||
|
for (int i=0;i<Nstop1;i++){
|
||||||
|
auto B = evec[i];
|
||||||
|
auto tmp = B;
|
||||||
|
auto v = B;
|
||||||
|
|
||||||
|
{
|
||||||
|
HermOp.HermOp(B,v);
|
||||||
|
|
||||||
|
RealD vnum = real(innerProduct(B,v)); // HermOp.
|
||||||
|
RealD vden = norm2(B);
|
||||||
|
RealD vv0 = norm2(v);
|
||||||
|
RealD eval2 = vnum/vden;
|
||||||
|
v -= eval2*B;
|
||||||
|
RealD vv = norm2(v);
|
||||||
|
|
||||||
|
std::cout << i << " OP eval = " << eval2 << " (" << eval1[i] << ") "
|
||||||
|
<< "res2 = " << vv << " norm2 = " << norm2(B) << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do second step only if needed
|
||||||
|
if (Nstop1 <= Nstop2) {
|
||||||
|
|
||||||
|
// Now setup blocking
|
||||||
|
assert(evec.size() == Nstop1);
|
||||||
|
BlockedGrid<LatticeFermion> bgrid(FrbGrid, block_size);
|
||||||
|
BlockProjector<LatticeFermion> pr(evec,bgrid);
|
||||||
|
pr.createOrthonormalBasis(basis_norm_threshold);
|
||||||
|
pr.createOrthonormalBasis(basis_norm_threshold); // another round due to precision issues created by local coherence
|
||||||
|
|
||||||
|
constexpr int common_basis_sizes[] = { 60, 250, 400 };
|
||||||
|
constexpr int n_common_basis_sizes = sizeof(common_basis_sizes) / sizeof(common_basis_sizes[0]);
|
||||||
|
switch (Nstop1) {
|
||||||
|
#define BASIS(n) case common_basis_sizes[n]:\
|
||||||
|
CoarseGridLanczos<LatticeFermion,common_basis_sizes[n]>\
|
||||||
|
(pr,alpha2,beta,Npoly2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2,HermOp,eval1, \
|
||||||
|
cg_test_enabled,cg_test_maxiter,nsingle,SkipTest2, \
|
||||||
|
MaxApply2,smoothed_eval_enabled,smoothed_eval_inner,smoothed_eval_outer, \
|
||||||
|
smoothed_eval_begin,smoothed_eval_end,smoothed_eval_inner_resid); break;
|
||||||
|
BASIS(0);
|
||||||
|
BASIS(1);
|
||||||
|
BASIS(2);
|
||||||
|
default:
|
||||||
|
std::cout << GridLogMessage << "Basis size " << Nstop1 << " must be added at compile-time" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Currently available sizes: " << std::endl;
|
||||||
|
for (int i=0;i<n_common_basis_sizes;i++) {
|
||||||
|
std::cout << GridLogMessage << " " << common_basis_sizes[i] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc (new file, 254 lines):
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Leans heavily on Christoph Lehner's code
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
/*
|
||||||
|
* Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features
|
||||||
|
* in Grid that were intended to be used to support blocked Aggregates, from
|
||||||
|
*/
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
|
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
|
class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef iVector<CComplex,nbasis > CoarseSiteVector;
|
||||||
|
typedef Lattice<CoarseSiteVector> CoarseField;
|
||||||
|
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
|
||||||
|
typedef Lattice<Fobj> FineField;
|
||||||
|
|
||||||
|
LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
|
||||||
|
LinearOperatorBase<FineField> &FineOp,
|
||||||
|
int checkerboard)
|
||||||
|
// Base constructor
|
||||||
|
: LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard)
|
||||||
|
{};
|
||||||
|
|
||||||
|
void checkpointFine(std::string evecs_file,std::string evals_file)
|
||||||
|
{
|
||||||
|
assert(this->_Aggregate.subspace.size()==nbasis);
|
||||||
|
emptyUserRecord record;
|
||||||
|
Grid::QCD::ScidacWriter WR;
|
||||||
|
WR.open(evecs_file);
|
||||||
|
for(int k=0;k<nbasis;k++) {
|
||||||
|
WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
|
||||||
|
XmlWriter WRx(evals_file);
|
||||||
|
write(WRx,"evals",this->evals_fine);
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkpointFineRestore(std::string evecs_file,std::string evals_file)
|
||||||
|
{
|
||||||
|
this->evals_fine.resize(nbasis);
|
||||||
|
this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
|
||||||
|
|
||||||
|
std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<<evals_file<<std::endl;
|
||||||
|
XmlReader RDx(evals_file);
|
||||||
|
read(RDx,"evals",this->evals_fine);
|
||||||
|
|
||||||
|
assert(this->evals_fine.size()==nbasis);
|
||||||
|
|
||||||
|
std::cout << GridLogIRL<< "checkpointFineRestore: Reading evecs from "<<evecs_file<<std::endl;
|
||||||
|
emptyUserRecord record;
|
||||||
|
Grid::QCD::ScidacReader RD ;
|
||||||
|
RD.open(evecs_file);
|
||||||
|
for(int k=0;k<nbasis;k++) {
|
||||||
|
this->_Aggregate.subspace[k].checkerboard=this->_checkerboard;
|
||||||
|
RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
|
||||||
|
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkpointCoarse(std::string evecs_file,std::string evals_file)
|
||||||
|
{
|
||||||
|
int n = this->evec_coarse.size();
|
||||||
|
emptyUserRecord record;
|
||||||
|
Grid::QCD::ScidacWriter WR;
|
||||||
|
WR.open(evecs_file);
|
||||||
|
for(int k=0;k<n;k++) {
|
||||||
|
WR.writeScidacFieldRecord(this->evec_coarse[k],record);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
|
||||||
|
XmlWriter WRx(evals_file);
|
||||||
|
write(WRx,"evals",this->evals_coarse);
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
|
||||||
|
{
|
||||||
|
std::cout << "resizing coarse vecs to " << nvec<< std::endl;
|
||||||
|
this->evals_coarse.resize(nvec);
|
||||||
|
this->evec_coarse.resize(nvec,this->_CoarseGrid);
|
||||||
|
std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<<evals_file<<std::endl;
|
||||||
|
XmlReader RDx(evals_file);
|
||||||
|
read(RDx,"evals",this->evals_coarse);
|
||||||
|
|
||||||
|
assert(this->evals_coarse.size()==nvec);
|
||||||
|
emptyUserRecord record;
|
||||||
|
std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evecs from "<<evecs_file<<std::endl;
|
||||||
|
Grid::QCD::ScidacReader RD ;
|
||||||
|
RD.open(evecs_file);
|
||||||
|
for(int k=0;k<nvec;k++) {
|
||||||
|
RD.readScidacFieldRecord(this->evec_coarse[k],record);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
int main (int argc, char ** argv) {
|
||||||
|
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
GridLogIRL.TimingMode(1);
|
||||||
|
|
||||||
|
LocalCoherenceLanczosParams Params;
|
||||||
|
{
|
||||||
|
Params.omega.resize(10);
|
||||||
|
Params.blockSize.resize(5);
|
||||||
|
XmlWriter writer("Params_template.xml");
|
||||||
|
write(writer,"Params",Params);
|
||||||
|
std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
XmlReader reader(std::string("./Params.xml"));
|
||||||
|
read(reader, "Params", Params);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Ls = (int)Params.omega.size();
|
||||||
|
RealD mass = Params.mass;
|
||||||
|
RealD M5 = Params.M5;
|
||||||
|
std::vector<int> blockSize = Params.blockSize;
|
||||||
|
|
||||||
|
// Grids
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> fineLatt = GridDefaultLatt();
|
||||||
|
int dims=fineLatt.size();
|
||||||
|
assert(blockSize.size()==dims+1);
|
||||||
|
std::vector<int> coarseLatt(dims);
|
||||||
|
std::vector<int> coarseLatt5d ;
|
||||||
|
|
||||||
|
for (int d=0;d<coarseLatt.size();d++){
|
||||||
|
coarseLatt[d] = fineLatt[d]/blockSize[d]; assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< " 5d coarse lattice is ";
|
||||||
|
for (int i=0;i<coarseLatt.size();i++){
|
||||||
|
std::cout << coarseLatt[i]<<"x";
|
||||||
|
}
|
||||||
|
int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
|
||||||
|
std::cout << cLs<<std::endl;
|
||||||
|
|
||||||
|
GridCartesian * CoarseGrid4 = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * CoarseGrid4rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
|
||||||
|
GridCartesian * CoarseGrid5 = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
|
||||||
|
GridRedBlackCartesian * CoarseGrid5rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid5);
|
||||||
|
|
||||||
|
// Gauge field
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
FieldMetaData header;
|
||||||
|
NerscIO::readConfiguration(Umu,header,Params.config);
|
||||||
|
std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl;
|
||||||
|
|
||||||
|
// ZMobius EO Operator
|
||||||
|
ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
|
||||||
|
SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
|
||||||
|
|
||||||
|
// Eigenvector storage
|
||||||
|
LanczosParams fine =Params.FineParams;
|
||||||
|
LanczosParams coarse=Params.CoarseParams;
|
||||||
|
|
||||||
|
const int Ns1 = fine.Nstop; const int Ns2 = coarse.Nstop;
|
||||||
|
const int Nk1 = fine.Nk; const int Nk2 = coarse.Nk;
|
||||||
|
const int Nm1 = fine.Nm; const int Nm2 = coarse.Nm;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Keep " << fine.Nstop << " fine vectors" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
|
||||||
|
assert(Nm2 >= Nm1);
|
||||||
|
|
||||||
|
const int nbasis= 60;
|
||||||
|
assert(nbasis==Ns1);
|
||||||
|
LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd);
|
||||||
|
std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
|
||||||
|
|
||||||
|
assert( (Params.doFine)||(Params.doFineRead));
|
||||||
|
|
||||||
|
if ( Params.doFine ) {
|
||||||
|
std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
|
||||||
|
_LocalCoherenceLanczos.calcFine(fine.Cheby,
|
||||||
|
fine.Nstop,fine.Nk,fine.Nm,
|
||||||
|
fine.resid,fine.MaxIt,
|
||||||
|
fine.betastp,fine.MinRes);
|
||||||
|
|
||||||
|
std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
|
||||||
|
_LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
|
||||||
|
_LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
|
||||||
|
_LocalCoherenceLanczos.Orthogonalise();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Params.doFineRead ) {
|
||||||
|
_LocalCoherenceLanczos.checkpointFineRestore(std::string("evecs.scidac"),std::string("evals.xml"));
|
||||||
|
_LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
|
||||||
|
_LocalCoherenceLanczos.Orthogonalise();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Params.doCoarse ) {
|
||||||
|
std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
|
||||||
|
_LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
|
||||||
|
coarse.Nstop, coarse.Nk,coarse.Nm,
|
||||||
|
coarse.resid, coarse.MaxIt,
|
||||||
|
coarse.betastp,coarse.MinRes);
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << GridLogIRL<<"Checkpointing coarse evecs"<<std::endl;
|
||||||
|
_LocalCoherenceLanczos.checkpointCoarse(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Params.doCoarseRead ) {
|
||||||
|
// Verify we can reread ???
|
||||||
|
_LocalCoherenceLanczos.checkpointCoarseRestore(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"),coarse.Nstop);
|
||||||
|
_LocalCoherenceLanczos.testCoarse(coarse.resid*100.0,Params.Smoother,Params.coarse_relax_tol); // Coarse check
|
||||||
|
}
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
@@ -84,11 +84,12 @@ int main (int argc, char ** argv)
   std::vector<double> Coeffs { 0.,-1.};
   Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheb(0.2,5.,11);
-  // ChebyshevLanczos<LatticeFermion> Cheb(9.,1.,0.,20);
-  // Cheb.csv(std::cout);
-  // exit(-24);
-  ImplicitlyRestartedLanczos<FermionField> IRL(HermOp,Cheb,Nstop,Nk,Nm,resid,MaxIt);
+  Chebyshev<FermionField> Cheby(0.2,5.,11);
+  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
+  PlainHermOp<FermionField> Op     (HermOp);
+
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
 
   std::vector<RealD> eval(Nm);
@@ -119,12 +119,13 @@ int main (int argc, char ** argv)
   RealD beta = 0.1;
   RealD mu = 0.0;
   int order = 11;
-  ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
+  Chebyshev<LatticeComplex> Cheby(alpha,beta,order);
   std::ofstream file("cheby.dat");
   Cheby.csv(file);
 
-  HermOpOperatorFunction<LatticeComplex> X;
   DumbOperator<LatticeComplex> HermOp(grid);
+  FunctionHermOp<LatticeComplex> OpCheby(Cheby,HermOp);
+  PlainHermOp<LatticeComplex> Op(HermOp);
 
   const int Nk = 40;
   const int Nm = 80;
@@ -133,8 +134,9 @@ int main (int argc, char ** argv)
   int Nconv;
   RealD eresid = 1.0e-6;
 
-  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit);
-  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> IRL(Op,Op,Nk,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(OpCheby,Op,Nk,Nk,Nm,eresid,Nit);
 
   LatticeComplex src(grid); gaussian(RNG,src);
   {
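The hunks above all make the same interface change to the implicitly restarted Lanczos: instead of a bare Hermitian operator plus a polynomial, the driver now receives two operator-function wrappers, a Chebyshev-filtered one and a plain one. Below is a minimal hedged sketch of that calling pattern; the class names and constructor argument order are taken from the hunks themselves, while the helper signature and the numerical values are illustrative placeholders only.

#include <Grid/Grid.h>
using namespace Grid;

// Hedged sketch of the new calling convention shown in the hunks above.
// HermOp is any Hermitian LinearOperatorBase acting on Field; eval and evec
// are expected to be sized to Nm by the caller, as in the tests.
template<class Field>
void run_irl_sketch(LinearOperatorBase<Field> &HermOp,
                    const Field &src,
                    std::vector<RealD> &eval,
                    std::vector<Field> &evec)
{
  const int Nstop = 30, Nk = 40, Nm = 80, MaxIt = 10000;   // illustrative values
  RealD resid = 1.0e-8;

  Chebyshev<Field>      Cheby(0.2, 5.0, 11);    // filter on an assumed spectral window [0.2, 5.0], order 11
  FunctionHermOp<Field> OpCheby(Cheby, HermOp); // Cheby(HermOp): drives the restarted iteration
  PlainHermOp<Field>    Op(HermOp);             // bare operator: used for the convergence checks

  ImplicitlyRestartedLanczos<Field> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

  int Nconv;
  IRL.calc(eval, evec, src, Nconv);
}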
@@ -86,9 +86,12 @@ int main(int argc, char** argv) {
 
   std::vector<double> Coeffs{0, 1.};
   Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheb(0.0, 10., 12);
-  ImplicitlyRestartedLanczos<FermionField> IRL(HermOp, PolyX, Nstop, Nk, Nm,
-                                               resid, MaxIt);
+  Chebyshev<FermionField> Cheby(0.0, 10., 12);
+  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
+  PlainHermOp<FermionField> Op     (HermOp);
+
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
 
   std::vector<RealD> eval(Nm);
   FermionField src(FGrid);
@@ -555,13 +555,13 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,0);
   // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis);
   assert ( (nbasis & 0x1)==0);
   int nb=nbasis/2;
   std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
-  // Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
-  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
+  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  // Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
   for(int n=0;n<nb;n++){
     G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
     std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
@@ -38,7 +38,7 @@ int main (int argc, char ** argv)
   typedef typename DomainWallFermionR::ComplexField ComplexField;
   typename DomainWallFermionR::ImplParams params;
 
-  const int Ls=8;
+  const int Ls=4;
 
   Grid_init(&argc,&argv);
 
@@ -47,42 +47,51 @@ int main (int argc, char ** argv)
   std::vector<int> mpi_layout  = GridDefaultMpi();
   std::vector<int> mpi_split (mpi_layout.size(),1);
 
-  std::cout << "UGrid (world root)"<<std::endl;
   GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 
-  std::cout << "FGrid (child of UGrid)"<<std::endl;
   GridCartesian * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  int nrhs = UGrid->RankCount() ;
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
   /////////////////////////////////////////////
   // Split into 1^4 mpi communicators
   /////////////////////////////////////////////
-  std::cout << "SGrid (world root)"<<std::endl;
+  for(int i=0;i<argc;i++){
+    if(std::string(argv[i]) == "--split"){
+      for(int k=0;k<mpi_layout.size();k++){
+        std::stringstream ss;
+        ss << argv[i+1+k];
+        ss >> mpi_split[k];
+      }
+      break;
+    }
+  }
+
+  int nrhs = 1;
+  int me;
+  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]);
 
   GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
                                             GridDefaultSimd(Nd,vComplex::Nsimd()),
                                             mpi_split,
-                                            *UGrid);
+                                            *UGrid,me);
 
   GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
-  std::cout << "SFGrid"<<std::endl;
   GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
-  std::cout << "SrbGrid"<<std::endl;
   GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
-  std::cout << "SFrbGrid"<<std::endl;
 
   ///////////////////////////////////////////////
   // Set up the problem as a 4d spreadout job
   ///////////////////////////////////////////////
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
   GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
   std::vector<FermionField> src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
   std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
 
   for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);
-  for(int s=0;s<nrhs;s++) result[s] = zero;
+  for(int s=0;s<nrhs;s++) result[s]=zero;
 
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
@@ -96,9 +105,11 @@ int main (int argc, char ** argv)
   emptyUserRecord record;
   std::string file("./scratch.scidac");
   std::string filef("./scratch.scidac.ferm");
-  int me = UGrid->ThisRank();
   LatticeGaugeField s_Umu(SGrid);
   FermionField s_src(SFGrid);
+  FermionField s_src_split(SFGrid);
+  FermionField s_tmp(SFGrid);
   FermionField s_res(SFGrid);
 
   {
@@ -157,6 +168,24 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
   }
 
+  ///////////////////////////////////////////////////////////////
+  // split the source out using MPI instead of I/O
+  ///////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage << " Splitting the grid data "<<std::endl;
+  Grid_split  (src,s_src_split);
+  std::cout << GridLogMessage << " Finished splitting the grid data "<<std::endl;
+  for(int n=0;n<nrhs;n++){
+    std::cout <<GridLogMessage<<"Full "<< n <<" "<< norm2(src[n])<<std::endl;
+  }
+  s_tmp = s_src_split - s_src;
+  for(int n=0;n<nrhs;n++){
+    FGrid->Barrier();
+    if ( n==me ) {
+      std::cout << GridLogMessage<<"Split "<< me << " " << norm2(s_src_split) << " " << norm2(s_src)<< " diff " << norm2(s_tmp)<<std::endl;
+    }
+    FGrid->Barrier();
+  }
+
 
   ///////////////////////////////////////////////////////////////
   // Set up N-solvers as trivially parallel
@@ -164,6 +193,7 @@ int main (int argc, char ** argv)
 
   RealD mass=0.01;
   RealD M5=1.8;
+  DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5);
   DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5);
 
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
@@ -171,25 +201,40 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
 
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
-  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000);
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
+  ConjugateGradient<FermionField> CG((1.0e-5/(me+1)),10000);
   s_res = zero;
   CG(HermOp,s_src,s_res);
 
-  ///////////////////////////////////////
-  // Share the information
-  ///////////////////////////////////////
+  /////////////////////////////////////////////////////////////
+  // Report how long they all took
+  /////////////////////////////////////////////////////////////
   std::vector<uint32_t> iterations(nrhs,0);
   iterations[me] = CG.IterationsToComplete;
 
   for(int n=0;n<nrhs;n++){
     UGrid->GlobalSum(iterations[n]);
+    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
   }
 
   /////////////////////////////////////////////////////////////
-  // Report how long they all took
+  // Gather and residual check on the results
   /////////////////////////////////////////////////////////////
-  for(int r=0;r<nrhs;r++){
-    std::cout << GridLogMessage<<" Rank "<<r<<" "<< iterations[r]<<" CG iterations"<<std::endl;
-  }
+  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
+  Grid_unsplit(result,s_res);
+  /*
+  Grid_unsplit(src_chk,s_src);
+  for(int n=0;n<nrhs;n++){
+    tmp = src[n]-src_chk[n];
+    std::cout << " src_chk "<<n<<" "<<norm2(src_chk[n])<<" " <<norm2(src[n])<<" " <<norm2(tmp)<< std::endl;
+    std::cout << " diff " <<tmp<<std::endl;
+  }
+  */
+  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
+  for(int n=0;n<nrhs;n++){
+    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
+    std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)<<std::endl;
+  }
 
   Grid_finalize();
 }
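The --split handling introduced above fixes how many right-hand sides are solved at once: each split communicator of shape mpi_split owns one solve, so nrhs is the ratio of the two process-grid volumes. The following small, self-contained restatement of that arithmetic uses an assumed four-dimensional layout purely for illustration.

#include <cassert>
#include <vector>

// Hedged restatement of the nrhs calculation in the hunk above: the number of
// independent solves equals the number of split sub-grids that tile the full
// MPI process grid. The layouts below are illustrative, not from any real run.
static int count_rhs(const std::vector<int> &mpi_layout,
                     const std::vector<int> &mpi_split)
{
  int nrhs = 1;
  for (size_t i = 0; i < mpi_layout.size(); i++) {
    assert(mpi_layout[i] % mpi_split[i] == 0);   // the split must divide the layout
    nrhs *= mpi_layout[i] / mpi_split[i];
  }
  return nrhs;
}

int main()
{
  std::vector<int> mpi_layout = {2, 2, 2, 2}; // e.g. an assumed 16-rank job
  std::vector<int> mpi_split  = {1, 1, 1, 1}; // e.g. --split 1 1 1 1
  return count_rhs(mpi_layout, mpi_split) == 16 ? 0 : 1; // 16 independent solves
}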
tests/solver/Test_dwf_mrhs_cg_mpi.cc (new file, 223 lines)
@@ -0,0 +1,223 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_mrhs_cg.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  typedef typename DomainWallFermionR::FermionField FermionField;
  typedef typename DomainWallFermionR::ComplexField ComplexField;
  typename DomainWallFermionR::ImplParams params;

  const int Ls=4;

  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);

  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                           GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                           GridDefaultMpi());
  GridCartesian * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////

  for(int i=0;i<argc;i++){
    if(std::string(argv[i]) == "--split"){
      for(int k=0;k<mpi_layout.size();k++){
        std::stringstream ss;
        ss << argv[i+1+k];
        ss >> mpi_split[k];
      }
      break;
    }
  }

  int nrhs = 1;
  int me;
  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]);

  GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                            mpi_split,
                                            *UGrid,me);

  GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);

  ///////////////////////////////////////////////
  // Set up the problem as a 4d spreadout job
  ///////////////////////////////////////////////
  std::vector<int> seeds({1,2,3,4});

  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
  std::vector<FermionField> src(nrhs,FGrid);
  std::vector<FermionField> src_chk(nrhs,FGrid);
  std::vector<FermionField> result(nrhs,FGrid);
  FermionField tmp(FGrid);

  for(int s=0;s<nrhs;s++) result[s]=zero;
#undef  LEXICO_TEST
#ifdef  LEXICO_TEST
  {
    LatticeFermion lex(FGrid);  lex = zero;
    LatticeFermion ftmp(FGrid);
    Integer stride =10000;
    double nrm;
    LatticeComplex coor(FGrid);
    for(int d=0;d<5;d++){
      LatticeCoordinate(coor,d);
      ftmp = stride;
      ftmp = ftmp * coor;
      lex = lex + ftmp;
      stride=stride/10;
    }
    for(int s=0;s<nrhs;s++) {
      src[s]=lex;
      ftmp = 1000*1000*s;
      src[s] = src[s] + ftmp;
    }
  }
#else
  for(int s=0;s<nrhs;s++) {
    random(pRNG5,src[s]);
    tmp = 100.0*s;
    src[s] = (src[s] * 0.1) + tmp;
    std::cout << " src ]"<<s<<"] "<<norm2(src[s])<<std::endl;
  }
#endif

  for(int n =0 ; n< nrhs ; n++) {
    std::cout << " src"<<n<<"\n"<< src[n] <<std::endl;
  }

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

  /////////////////
  // MPI only sends
  /////////////////
  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_tmp(SFGrid);
  FermionField s_res(SFGrid);

  ///////////////////////////////////////////////////////////////
  // split the source out using MPI instead of I/O
  ///////////////////////////////////////////////////////////////
  Grid_split  (Umu,s_Umu);
  Grid_split  (src,s_src);
  std::cout << " split rank  " <<me << " s_src "<<norm2(s_src)<<std::endl;
  std::cout << " s_src\n "<< s_src <<std::endl;

#ifdef  LEXICO_TEST
  FermionField s_src_tmp(SFGrid);
  FermionField s_src_diff(SFGrid);
  {
    LatticeFermion lex(SFGrid);  lex = zero;
    LatticeFermion ftmp(SFGrid);
    Integer stride =10000;
    double nrm;
    LatticeComplex coor(SFGrid);
    for(int d=0;d<5;d++){
      LatticeCoordinate(coor,d);
      ftmp = stride;
      ftmp = ftmp * coor;
      lex = lex + ftmp;
      stride=stride/10;
    }
    s_src_tmp=lex;
    ftmp = 1000*1000*me;
    s_src_tmp = s_src_tmp + ftmp;
  }
  s_src_diff = s_src_tmp - s_src;
  std::cout << " s_src_diff " << norm2(s_src_diff)<<std::endl;

  std::cout << " s_src \n" << s_src << std::endl;
  std::cout << " s_src_tmp \n" << s_src_tmp << std::endl;
  std::cout << " s_src_diff \n" << s_src_diff << std::endl;
#endif

  ///////////////////////////////////////////////////////////////
  // Set up N-solvers as trivially parallel
  ///////////////////////////////////////////////////////////////
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5);
  DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5);

  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
  ConjugateGradient<FermionField> CG((1.0e-5),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

  std::cout << " s_res norm "<<norm2(s_res)<<std::endl;
  /////////////////////////////////////////////////////////////
  // Report how long they all took
  /////////////////////////////////////////////////////////////
  std::vector<uint32_t> iterations(nrhs,0);
  iterations[me] = CG.IterationsToComplete;

  for(int n=0;n<nrhs;n++){
    UGrid->GlobalSum(iterations[n]);
    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
  }

  /////////////////////////////////////////////////////////////
  // Gather and residual check on the results
  /////////////////////////////////////////////////////////////
  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
  Grid_unsplit(result,s_res);

  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
    std::cout << " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
    std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)/norm2(src[n])<<std::endl;
  }

  Grid_finalize();
}
tests/solver/Test_dwf_mrhs_cg_mpieo.cc (new file, 164 lines)
@@ -0,0 +1,164 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_mrhs_cg.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  typedef typename DomainWallFermionR::FermionField FermionField;
  typedef typename DomainWallFermionR::ComplexField ComplexField;
  typename DomainWallFermionR::ImplParams params;

  const int Ls=4;

  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);

  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                           GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                           GridDefaultMpi());
  GridCartesian * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  int nrhs = UGrid->RankCount() ;

  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////
  int me;
  GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                            mpi_split,
                                            *UGrid,me);

  GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);

  ///////////////////////////////////////////////
  // Set up the problem as a 4d spreadout job
  ///////////////////////////////////////////////
  std::vector<int> seeds({1,2,3,4});

  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
  std::vector<FermionField> src(nrhs,FGrid);
  std::vector<FermionField> src_chk(nrhs,FGrid);
  std::vector<FermionField> result(nrhs,FGrid);
  FermionField tmp(FGrid);

  std::vector<FermionField> src_e(nrhs,FrbGrid);
  std::vector<FermionField> src_o(nrhs,FrbGrid);

  for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);
  for(int s=0;s<nrhs;s++) result[s]=zero;

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

  /////////////////
  // MPI only sends
  /////////////////
  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_src_e(SFrbGrid);
  FermionField s_src_o(SFrbGrid);
  FermionField s_tmp(SFGrid);
  FermionField s_res(SFGrid);

  ///////////////////////////////////////////////////////////////
  // split the source out using MPI instead of I/O
  ///////////////////////////////////////////////////////////////
  Grid_split  (Umu,s_Umu);
  Grid_split  (src,s_src);

  ///////////////////////////////////////////////////////////////
  // Check even odd cases
  ///////////////////////////////////////////////////////////////
  for(int s=0;s<nrhs;s++){
    pickCheckerboard(Odd , src_o[s], src[s]);
    pickCheckerboard(Even, src_e[s], src[s]);
  }
  Grid_split  (src_e,s_src_e);
  Grid_split  (src_o,s_src_o);
  setCheckerboard(s_tmp, s_src_o);
  setCheckerboard(s_tmp, s_src_e);
  s_tmp = s_tmp - s_src;
  std::cout << GridLogMessage<<" EvenOdd Difference " <<norm2(s_tmp)<<std::endl;

  ///////////////////////////////////////////////////////////////
  // Set up N-solvers as trivially parallel
  ///////////////////////////////////////////////////////////////
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5);
  DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5);

  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

  /////////////////////////////////////////////////////////////
  // Report how long they all took
  /////////////////////////////////////////////////////////////
  std::vector<uint32_t> iterations(nrhs,0);
  iterations[me] = CG.IterationsToComplete;

  for(int n=0;n<nrhs;n++){
    UGrid->GlobalSum(iterations[n]);
    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
  }

  /////////////////////////////////////////////////////////////
  // Gather and residual check on the results
  /////////////////////////////////////////////////////////////
  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
  Grid_unsplit(result,s_res);

  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
    std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)<<std::endl;
  }

  Grid_finalize();
}
tests/solver/Test_split_grid.cc (new file, 157 lines)
@@ -0,0 +1,157 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_mrhs_cg.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  typedef typename DomainWallFermionR::FermionField FermionField;
  typedef typename DomainWallFermionR::ComplexField ComplexField;
  typename DomainWallFermionR::ImplParams params;

  const int Ls=4;

  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);

  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridCartesian * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////

  for(int i=0;i<argc;i++){
    if(std::string(argv[i]) == "--split"){
      for(int k=0;k<mpi_layout.size();k++){
        std::stringstream ss;
        ss << argv[i+1+k];
        ss >> mpi_split[k];
      }
      break;
    }
  }

  int nrhs = 1;
  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]);

  GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                            mpi_split,
                                            *UGrid);

  GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);

  ///////////////////////////////////////////////
  // Set up the problem as a 4d spreadout job
  ///////////////////////////////////////////////
  std::vector<int> seeds({1,2,3,4});

  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
  std::vector<FermionField> src(nrhs,FGrid);
  std::vector<FermionField> src_chk(nrhs,FGrid);
  std::vector<FermionField> result(nrhs,FGrid);
  FermionField tmp(FGrid);

  for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);
  for(int s=0;s<nrhs;s++) result[s]=zero;

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

  /////////////////
  // MPI only sends
  /////////////////
  int me = UGrid->ThisRank();

  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_tmp(SFGrid);
  FermionField s_res(SFGrid);

  ///////////////////////////////////////////////////////////////
  // split the source out using MPI instead of I/O
  ///////////////////////////////////////////////////////////////
  Grid_split  (Umu,s_Umu);
  Grid_split  (src,s_src);

  ///////////////////////////////////////////////////////////////
  // Set up N-solvers as trivially parallel
  ///////////////////////////////////////////////////////////////
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5);
  DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5);

  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

  /////////////////////////////////////////////////////////////
  // Report how long they all took
  /////////////////////////////////////////////////////////////
  std::vector<uint32_t> iterations(nrhs,0);
  iterations[me] = CG.IterationsToComplete;

  for(int n=0;n<nrhs;n++){
    UGrid->GlobalSum(iterations[n]);
    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
  }

  /////////////////////////////////////////////////////////////
  // Gather and residual check on the results
  /////////////////////////////////////////////////////////////
  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
  Grid_unsplit(result,s_res);

  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
    std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)<<std::endl;
  }

  Grid_finalize();
}
tests/solver/Test_staggered_block_cg_prec.cc (new file, 130 lines)
@@ -0,0 +1,130 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_wilson_cg_unprec.cc

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};

int main (int argc, char ** argv)
{
  typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;
  typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
  typename ImprovedStaggeredFermion5DR::ImplParams params;

  const int Ls=8;

  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();

  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);

  FermionField src(FGrid); random(pRNG5,src);
  FermionField src_o(FrbGrid);   pickCheckerboard(Odd,src_o,src);
  FermionField result_o(FrbGrid); result_o=zero;
  RealD nrm = norm2(src);

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass=0.003;
  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
  SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);

  ConjugateGradient<FermionField> CG(1.0e-8,10000);
  int blockDim = 0;
  BlockConjugateGradient<FermionField>    BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000);
  BlockConjugateGradient<FermionField>    BCG  (BlockCG,blockDim,1.0e-8,10000);
  BlockConjugateGradient<FermionField>    mCG  (CGmultiRHS,blockDim,1.0e-8,10000);

  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
  std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
  SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
  FermionField src4d(UGrid); random(pRNG,src4d);
  FermionField src4d_o(UrbGrid);   pickCheckerboard(Odd,src4d_o,src4d);
  FermionField result4d_o(UrbGrid);

  result4d_o=zero;
  CG(HermOp4d,src4d_o,result4d_o);
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  Ds.ZeroCounters();
  result_o=zero;
  CG(HermOp,src_o,result_o);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  Ds.ZeroCounters();
  result_o=zero;
  mCG(HermOp,src_o,result_o);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  Ds.ZeroCounters();
  result_o=zero;
  BCGrQ(HermOp,src_o,result_o);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  Grid_finalize();
}
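The new staggered block-CG test above drives one Schur-style operator through an ordinary CG and through block variants selected by the first BlockConjugateGradient constructor argument. A hedged sketch of just that dispatch follows; the selector names (CGmultiRHS, BlockCGrQ), the blockDim value and the tolerances are copied from the listing, while the helper signature and surrounding types are assumptions for illustration.

#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace Grid;

// Hedged sketch of the solver comparison in the test above. HermOp is the
// SchurStaggeredOperator built there; src_o/result_o are the odd-checkerboard
// fields. blockDim = 0 is taken verbatim from the test and presumably selects
// the lattice dimension that carries the multiple right-hand sides.
template<class FermionField, class HermitianOperator>
void compare_block_solvers(HermitianOperator &HermOp,
                           const FermionField &src_o,
                           FermionField &result_o)
{
  const int blockDim = 0;
  ConjugateGradient<FermionField>      CG   (1.0e-8, 10000);                        // one RHS at a time
  BlockConjugateGradient<FermionField> mCG  (CGmultiRHS, blockDim, 1.0e-8, 10000);  // independent multi-RHS CG
  BlockConjugateGradient<FermionField> BCGrQ(BlockCGrQ,  blockDim, 1.0e-8, 10000);  // blocked CG (the BCGrQ variant in the test)

  result_o = zero; CG   (HermOp, src_o, result_o);
  result_o = zero; mCG  (HermOp, src_o, result_o);
  result_o = zero; BCGrQ(HermOp, src_o, result_o);
}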
@@ -48,7 +48,6 @@ struct scal {
 int main (int argc, char ** argv)
 {
   typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
-  typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField;
   typename ImprovedStaggeredFermionR::ImplParams params;
 
   Grid_init(&argc,&argv);
@@ -71,7 +70,7 @@ int main (int argc, char ** argv)
     volume=volume*latt_size[mu];
   }
 
-  RealD mass=0.1;
+  RealD mass=0.003;
   ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
 
   FermionField res_o(&RBGrid);
@@ -79,9 +78,14 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd,src_o,src);
   res_o=zero;
 
-  SchurDiagMooeeOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
+  SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   CG(HermOpEO,src_o,res_o);
 
+  FermionField tmp(&RBGrid);
+
+  HermOpEO.Mpc(res_o,tmp);
+  std::cout << "check Mpc resid " << axpy_norm(tmp,-1.0,src_o,tmp)/norm2(src_o) << "\n";
+
   Grid_finalize();
 }
tests/solver/Test_staggered_cg_schur.cc (new file, 76 lines)
@@ -0,0 +1,76 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_wilson_cg_schur.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};

int main (int argc, char ** argv)
{
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  typename ImprovedStaggeredFermionR::ImplParams params;
  Grid_init(&argc,&argv);

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  FermionField    src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  FermionField  resid(&Grid);

  RealD mass=0.1;
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);

  ConjugateGradient<FermionField> CG(1.0e-8,10000);
  SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

  SchurSolver(Ds,src,result);

  Grid_finalize();
}