Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg

2025-05-15 15:05:46 +01:00 · 2018-03-23 21:13:50 +01:00 · 2018-03-23 21:13:50 +01:00 · afdcbf79d1
commit afdcbf79d1
parent 3c3ec4e267 f290b2e908
37 changed files with 1129 additions and 551 deletions
--- a/.gitignore
+++ b/.gitignore
@ -123,6 +123,7 @@ make-bin-BUCK.sh
 #####################
 lib/qcd/spin/gamma-gen/*.h
 lib/qcd/spin/gamma-gen/*.cc
+lib/version.h

 # vs code editor files #
 ########################
--- a/Makefile.am
+++ b/Makefile.am
@ -5,6 +5,10 @@ include $(top_srcdir)/doxygen.inc

 bin_SCRIPTS=grid-config

+BUILT_SOURCES = version.h
+
+version.h:
+	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d\\"%n" HEAD`" > $(srcdir)/lib/version.h

 .PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)

--- a/configure.ac
+++ b/configure.ac
@ -340,7 +340,7 @@ case ${ac_PRECISION} in
 esac

 ######################  Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs],
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs|shmnone],
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])

 case ${ac_SHM} in
@ -349,6 +349,10 @@ case ${ac_SHM} in
     AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
     ;;

+     shmnone)
+     AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
+     ;;
+
     hugetlbfs)
     AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
     ;;
--- a/lib/algorithms/Algorithms.h
+++ b/lib/algorithms/Algorithms.h
@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>

+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
--- a/lib/algorithms/iterative/Deflation.h
+++ b/lib/algorithms/iterative/Deflation.h
@ -0,0 +1,101 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Guesser that ignores the source and assigns Zero() to the initial guess.
+struct ZeroGuesser {
+  template<class Field>
+  void operator()(const Field &src,Field &guess) { guess = Zero(); }
+};
+// Guesser that copies the source into the initial guess.
+struct SourceGuesser {
+  template<class Field>
+  void operator()(const Field &src,Field &guess) { guess = src; }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+// Builds an initial guess from eigenvector/eigenvalue pairs: each evec[i] is
+// accumulated into guess via axpy with coefficient innerProduct(evec[i],src)/eval[i].
+template<class Field>
+struct DeflatedGuesser {
+private:
+  // Held by reference only: the caller must keep both vectors alive.
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+public:
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}
+  void operator()(const Field &src,Field &guess) {
+    guess = zero;
+    assert(evec.size()==eval.size());
+    const size_t N = evec.size();
+    for (size_t i=0;i<N;i++) {   // unsigned index: no signed/unsigned comparison against size()
+      const Field& tmp = evec[i];
+      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
+    }
+  }
+};
+
+// Deflation through a coarse eigenbasis: src is passed to blockProject to obtain a coarse
+// source, deflated with (evec_coarse,eval_coarse), and handed to blockPromote to fill guess.
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser {
+private:
+  const std::vector<FineField>   &subspace;     // references only: the caller owns the storage
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace),
+      evec_coarse(_evec_coarse),
+      eval_coarse(_eval_coarse) {}
+  void operator()(const FineField &src,FineField &guess) {
+    int N = (int)evec_coarse.size();
+    assert(N>0);                                    // evec_coarse[0] supplies the coarse grid below
+    assert(evec_coarse.size()==eval_coarse.size()); // same consistency check as DeflatedGuesser
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
+    blockProject(src_coarse,src,subspace);
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i];
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+  }
+};
+
+
+
+}
+#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
  basisReorderInPlace(_v,sort_vals,idx);
 }

-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = zero;
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@ -181,6 +168,7 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
+
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
  /////////////////////////
  
 public:       
+
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
+
 namespace Grid { 
+
+
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@ -70,21 +73,24 @@ public:
  typedef Lattice<Fobj>          FineField;

  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
-    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace)
+  {  
+    assert(subspace.size() >0);
+  };

  void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+      
+    FineField fin (FineGrid);     fin.checkerboard= checkerboard;
+    FineField fout(FineGrid);   fout.checkerboard = checkerboard;

-    GridBase *FineGrid = _Aggregate.FineGrid;
-    FineField fin(FineGrid);
-    FineField fout(FineGrid);
-
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
-    _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+    blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+    _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+    blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
 };

@ -99,24 +105,27 @@ public:

  OperatorFunction<FineField>   & _poly;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, 
-			  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
    _poly(poly),
    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+    subspace(_subspace)
+  {  };

  void operator()(const CoarseField& in, CoarseField& out) {
-
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard;
-    FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
    
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+
+    FineField fin (FineGrid); fin.checkerboard =checkerboard;
+    FineField fout(FineGrid);fout.checkerboard =checkerboard;
+    
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
 };

@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
-  RealD                             _coarse_relax_tol;
+  RealD                          _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };
+    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
+      _coarse_relax_tol(coarse_relax_tol)  
+  {    };

  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval;
+
    // Apply operator
    _Poly(B,v);

@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  }
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.checkerboard;
-
+    GridBase *FineGrid = _subspace[0]._grid;    
+    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;

-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);  
+    
    _smoother(_Linop,fv,fB); 

    RealD eval_poly = eval;
@ -217,27 +229,65 @@ protected:
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis>               _Aggregate;  
-  std::vector<RealD>                              evals_fine;
-  std::vector<RealD>                              evals_coarse; 
-  std::vector<CoarseField>                        evec_coarse;
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
 public:
+
  LocalCoherenceLanczos(GridBase *FineGrid,
-		GridBase *CoarseGrid,
-		LinearOperatorBase<FineField> &FineOp,
-		int checkerboard) :
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
    _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by Hadrons module
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _FineOp(FineOp),
+    _checkerboard(checkerboard),
+    evals_fine  (ext_eval_fine), 
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  void Orthogonalise(void ) {                // runs blockOrthogonalise twice over the fine-field subspace vectors
+    CoarseScalar InnerProd(_CoarseGrid); 
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+  };

  template<typename T>  static RealD normalise(T& v) 
  {
@ -246,43 +296,44 @@ public:
    v = v * (1.0/nn);
    return nn;
  }
-
+  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].checkerboard=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
-      _Aggregate.subspace[k].checkerboard=_checkerboard;
-      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
-      normalise(_Aggregate.subspace[k]);
+      subspace[k].checkerboard=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
    }
  }
+  */

  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
+      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
    }
  }

  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace); // bound reference: valid for internal or external storage
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    for(int k=0;k<evec_coarse.size();k++){
      if ( k < nbasis ) { 
@ -302,34 +353,34 @@ public:
    PlainHermOp<FineField>    Op(_FineOp);

    evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
    
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
  }
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);  // bound reference: valid for internal or external storage
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@ -107,7 +107,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;               // default initial guess: ZeroGuesser assigns Zero() to the field
+      (*this)(_Matrix,in,out,guess);   // forward to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@ -129,7 +134,6 @@ namespace Grid {
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
-
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
    
      /////////////////////////////////////////////////////
@ -146,6 +150,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;

@ -189,7 +194,12 @@ namespace Grid {
    CBfactorise=cb;
  };
    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;               // default initial guess: ZeroGuesser assigns Zero() to the field
+      (*this)(_Matrix,in,out,guess);   // forward to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@ -225,6 +235,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
@ -268,7 +279,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;               // default initial guess: ZeroGuesser assigns Zero() to the field
+      (*this)(_Matrix,in,out,guess);   // forward to the overload that takes an explicit guesser
+    }
+    template<class Matrix,class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@ -305,6 +321,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

@ -347,7 +364,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser guess;               // default initial guess: ZeroGuesser assigns Zero() to the field
+      (*this)(_Matrix,in,out,guess);   // forward to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@ -385,6 +407,7 @@ namespace Grid {
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
+    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+        (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
+      assert(0);
  }

  Grid_quiesce_nodes();

+  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

  GlobalSharedMemory::Init(communicator_world);
@ -85,9 +89,17 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 {
  MPI_Comm optimal_comm;
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm which must be freed
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
+  // Free the temp communicator
+  ///////////////////////////////////////////////////
+  MPI_Comm_free(&optimal_comm);
 }

 //////////////////////////////////
@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,

  } else {
    srank = 0;
-    comm_split    = parent.communicator;
-    //    std::cout << " Inherited communicator " <<comm_split <<std::endl;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
@ -196,6 +208,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);
+  
+  ///////////////////////////////////////////////
+  // Free the temp communicator 
+  ///////////////////////////////////////////////
+  MPI_Comm_free(&comm_split);

  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,

 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
+  ////////////////////////////////////////////////////
+  // Creates communicator, and the communicator_halo
+  ////////////////////////////////////////////////////
  _ndimension = processors.size();
  _processor_coor.resize(_ndimension);

--- a/lib/communicator/SharedMemory.h
+++ b/lib/communicator/SharedMemory.h
@ -133,6 +133,7 @@ class SharedMemory

 public:
  SharedMemory() {};
+  ~SharedMemory();
  ///////////////////////////////////////////////////////////////////////////////////////
  // set the buffers & sizes
  ///////////////////////////////////////////////////////////////////////////////////////
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */

 #include <Grid/GridCore.h>
+#include <pwd.h>

 namespace Grid { 

@ -226,6 +227,48 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 };
 #endif // MMAP

+#ifdef GRID_MPI3_SHM_NONE
+////////////////////////////////////////////////////////////////////////////////////////////
+// Anonymous mmap implementation: no file or shm object backs the buffer.
+// bytes : size of the region to map; flags : non-zero adds MAP_HUGETLB where it is defined.
+// Requires WorldShmSize == 1 (asserted below).
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+
+  assert(WorldShmSize == 1);
+  for(int r=0;r<WorldShmSize;r++){
+    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
+#ifdef MAP_POPULATE
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+    // Anonymous mappings are requested with fd = -1, so there is no descriptor to close afterwards.
+    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,-1, 0);
+    if ( ptr == (void *)MAP_FAILED ) {
+      // No name is associated with an anonymous mapping; report the requested size instead.
+      printf("anonymous mmap of %llu bytes failed\n",(unsigned long long)bytes);
+      perror("failed mmap");      assert(0);
+    }
+    assert(((uint64_t)ptr&0x3F)==0);
+    WorldShmCommBufs[r] =ptr;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif // GRID_MPI3_SHM_NONE
+
 #ifdef GRID_MPI3_SHMOPEN
 ////////////////////////////////////////////////////////////////////////////////////////////
 // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
@ -246,7 +289,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	
      size_t size = bytes;
      
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      struct passwd *pw = getpwuid (getuid());  // NULL when the uid has no passwd entry
+      sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw ? pw->pw_name : "nouser",WorldNode,r);
      
      shm_unlink(shm_name);
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
@ -281,7 +325,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)

      size_t size = bytes ;
      
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      struct passwd *pw = getpwuid (getuid());  // NULL when the uid has no passwd entry
+      sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw ? pw->pw_name : "nouser",WorldNode,r);
      
      int fd=shm_open(shm_name,O_RDWR,0666);
      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
@ -399,5 +444,12 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
    return (void *) remote;
  }
 }
+// Frees ShmComm with MPI_Comm_free unless MPI_Finalized reports that MPI has already shut down.
+SharedMemory::~SharedMemory()
+{
+  int mpi_done;  MPI_Finalized(&mpi_done);
+  if ( mpi_done ) return;
+  MPI_Comm_free(&ShmComm);
+}

 }
--- a/lib/communicator/SharedMemoryNone.cc
+++ b/lib/communicator/SharedMemoryNone.cc
@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  return NULL;
 }
+// Destructor with an empty body.
+SharedMemory::~SharedMemory() {}

 }
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@ -198,7 +198,7 @@ namespace Grid {
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
-  template<class vsimd>\
+  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
    {									\
      return lhs._internal op rhs._internal;				\
--- a/lib/lattice/Lattice_coordinate.h
+++ b/lib/lattice/Lattice_coordinate.h
@ -52,23 +52,5 @@ namespace Grid {
      }
    };

-    // LatticeCoordinate();
-    // FIXME for debug; deprecate this; made obscelete by 
-    template<class vobj> void lex_sites(Lattice<vobj> &l){
-      Real *v_ptr = (Real *)&l._odata[0];
-      size_t o_len = l._grid->oSites();
-      size_t v_len = sizeof(vobj)/sizeof(vRealF);
-      size_t vec_len = vRealF::Nsimd();
-
-      for(int i=0;i<o_len;i++){
-	for(int j=0;j<v_len;j++){
-          for(int vv=0;vv<vec_len;vv+=2){
-	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
-	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
-	  }
-	}}
-    }
-
-
 }
 #endif
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@ -659,6 +659,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  assert(out._grid->Nd() == in._grid->Nd());
+  assert(out._grid->FullDimensions() == in._grid->FullDimensions());
  out.checkerboard = in.checkerboard;
  GridBase *in_grid=in._grid;
  GridBase *out_grid = out._grid;
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@ -91,7 +91,7 @@ class BinaryIO {
    typedef typename vobj::scalar_object sobj;

    GridBase *grid = lat._grid;
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();

    std::vector<sobj> scalardata(lsites); 
    unvectorizeToLexOrdArray(scalardata,lat);    
@ -160,7 +160,9 @@ class BinaryIO {

 	/* 
 	 * Scidac csum  is rather more heavyweight
+	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
+	
 	int global_site;

 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@ -261,7 +263,7 @@ class BinaryIO {
 			      GridBase *grid,
 			      std::vector<fobj> &iodata,
 			      std::string file,
-			      Integer offset,
+			      uint64_t offset,
 			      const std::string &format, int control,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
@ -523,7 +525,7 @@ class BinaryIO {
  static inline void readLatticeObject(Lattice<vobj> &Umu,
 				       std::string file,
 				       munger munge,
-				       Integer offset,
+				       uint64_t offset,
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
@ -533,7 +535,7 @@ class BinaryIO {
    typedef typename vobj::Realified::scalar_type word;    word w=0;

    GridBase *grid = Umu._grid;
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();

    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@ -544,7 +546,7 @@ class BinaryIO {
    GridStopWatch timer; 
    timer.Start();

-    parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
+    parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);

    vectorizeFromLexOrdArray(scalardata,Umu);    
    grid->Barrier();
@ -560,7 +562,7 @@ class BinaryIO {
    static inline void writeLatticeObject(Lattice<vobj> &Umu,
 					  std::string file,
 					  munger munge,
-					  Integer offset,
+					  uint64_t offset,
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
@ -569,7 +571,7 @@ class BinaryIO {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();

    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@ -580,7 +582,7 @@ class BinaryIO {
    GridStopWatch timer; timer.Start();
    unvectorizeToLexOrdArray(scalardata,Umu);    

-    parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
+    parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);

    grid->Barrier();
    timer.Stop();
@ -597,7 +599,7 @@ class BinaryIO {
  static inline void readRNG(GridSerialRNG &serial,
 			     GridParallelRNG &parallel,
 			     std::string file,
-			     Integer offset,
+			     uint64_t offset,
 			     uint32_t &nersc_csum,
 			     uint32_t &scidac_csuma,
 			     uint32_t &scidac_csumb)
@ -610,8 +612,8 @@ class BinaryIO {
    std::string format = "IEEE32BIG";

    GridBase *grid = parallel._grid;
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();

    uint32_t nersc_csum_tmp   = 0;
    uint32_t scidac_csuma_tmp = 0;
@ -626,7 +628,7 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);

    timer.Start();
-    parallel_for(int lidx=0;lidx<lsites;lidx++){
+    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
      parallel.SetState(tmp,lidx);
@ -659,7 +661,7 @@ class BinaryIO {
  static inline void writeRNG(GridSerialRNG &serial,
 			      GridParallelRNG &parallel,
 			      std::string file,
-			      Integer offset,
+			      uint64_t offset,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
 			      uint32_t &scidac_csumb)
@ -670,8 +672,8 @@ class BinaryIO {
    typedef std::array<RngStateType,RngStateCount> RNGstate;

    GridBase *grid = parallel._grid;
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();

    uint32_t nersc_csum_tmp;
    uint32_t scidac_csuma_tmp;
@ -684,7 +686,7 @@ class BinaryIO {

    timer.Start();
    std::vector<RNGstate> iodata(lsites);
-    parallel_for(int lidx=0;lidx<lsites;lidx++){
+    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);
      parallel.GetState(tmp,lidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@ -337,6 +337,20 @@ class GridLimeWriter : public BinaryIO {
  template<class vobj>
  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
+    ////////////////////////////////////////////////////////////////////
+    // NB: FILE and iostream are jointly writing disjoint sequences in
+    // the same file through different file handles (integer units).
+    // 
+    // These are both buffered, so why I think this code is right is as follows.
+    //
+    // i)  write record header to FILE *File, telegraphing the size; flush
+    // ii) ftello reads the offset from FILE *File . 
+    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
+    //      Closes iostream and flushes.
+    // iv) fseek on FILE * to end of this disjoint section.
+    //  v) Continue writing scidac record.
+    ////////////////////////////////////////////////////////////////////
+
    ////////////////////////////////////////////
    // Create record header
    ////////////////////////////////////////////
@ -350,25 +364,24 @@ class GridLimeWriter : public BinaryIO {
    //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl;
    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;

-    ////////////////////////////////////////////////////////////////////
-    // NB: FILE and iostream are jointly writing disjoint sequences in the
-    // the same file through different file handles (integer units).
-    // 
-    // These are both buffered, so why I think this code is right is as follows.
-    //
-    // i)  write record header to FILE *File, telegraphing the size. 
-    // ii) ftello reads the offset from FILE *File .
-    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
-    //      Closes iostream and flushes.
-    // iv) fseek on FILE * to end of this disjoint section.
-    //  v) Continue writing scidac record.
-    ////////////////////////////////////////////////////////////////////
-    uint64_t offset = ftello(File);
-    //    std::cout << " Writing to offset "<<offset << std::endl;
+    fflush(File);
+
+    ///////////////////////////////////////////
+    // Write by other means into the binary record
+    ///////////////////////////////////////////
+    uint64_t offset1 = ftello(File);    //    std::cout << " Writing to offset "<<offset1 << std::endl;
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-    //    fseek(File,0,SEEK_END);    offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+
+    ///////////////////////////////////////////
+    // Wind forward and close the record
+    ///////////////////////////////////////////
+    fseek(File,0,SEEK_END);             
+    uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
+
+    assert((offset2-offset1) == PayloadSize);
+
    err=limeWriterCloseRecord(LimeW);  assert(err>=0);

    ////////////////////////////////////////
@ -568,7 +581,6 @@ class IldgWriter : public ScidacWriter {
    writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
    //    limeDestroyWriter(LimeW);
-    fclose(File);
  }
 };

--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@ -57,7 +57,7 @@ namespace Grid {
      // for the header-reader
      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
      {
-      int offset=0;
+      uint64_t offset=0;
      std::map<std::string,std::string> header;
      std::string line;

@ -139,7 +139,7 @@ namespace Grid {
      typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

      GridBase *grid = Umu._grid;
-      int offset = readHeader(file,Umu._grid,header);
+      uint64_t offset = readHeader(file,Umu._grid,header);

      FieldMetaData clone(header);

@ -236,7 +236,7 @@ namespace Grid {
 	GaugeStatistics(Umu,header);
 	MachineCharacteristics(header);

-	int offset;
+	uint64_t offset;
  
 	truncate(file);

@ -278,7 +278,7 @@ namespace Grid {
 	header.plaquette=0.0;
 	MachineCharacteristics(header);

-	int offset;
+	uint64_t offset;
  
 #ifdef RNG_RANLUX
 	header.floating_point = std::string("UINT64");
@ -313,7 +313,7 @@ namespace Grid {

 	GridBase *grid = parallel._grid;

-	int offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);

 	FieldMetaData clone(header);

--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  this->DW(psi,tmp_f,DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-conj(c[s]) D_W^dag) psi
  }
 }

--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {

    RealD factor = 0.5 * beta / RealD(Nc);

-    //GaugeLinkField Umu(U._grid);
+    GaugeLinkField Umu(U._grid);
    GaugeLinkField dSdU_mu(U._grid);
    for (int mu = 0; mu < Nd; mu++) {
-      //Umu = PeekIndex<LorentzIndex>(U, mu);
+      Umu = PeekIndex<LorentzIndex>(U, mu);

      // Staple in direction mu
-      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
-
-  
-      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
-      dSdU_mu = Ta(dSdU_mu) * factor;
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      dSdU_mu = Ta(Umu * dSdU_mu) * factor;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@ -212,6 +212,7 @@ public:


 // For the force term
+/*
 static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    GridBase *grid = Umu._grid;
    std::vector<GaugeMat> U(Nd, grid);
@ -225,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
-        // this is ~10% faster than the Staple
+        // this is ~10% faster than the Staple  -- PAB: so what it gives the WRONG answers for other BC's!
        tmp1 = Cshift(U[nu], mu, 1);
        tmp2 = Cshift(U[mu], nu, 1);
        staple += tmp1* adj(U[nu]*tmp2);
@ -235,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    }
    staple = U[mu]*staple;
 }
-
+*/
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
--- a/lib/serialisation/BaseIO.h
+++ b/lib/serialisation/BaseIO.h
@ -31,113 +31,10 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
 #define GRID_SERIALISATION_ABSTRACT_READER_H

 #include <type_traits>
+#include <Grid/tensors/Tensors.h>
+#include <Grid/serialisation/VectorUtils.h>

 namespace Grid {
-  // Vector IO utilities ///////////////////////////////////////////////////////
-  // helper function to read space-separated values
-  template <typename T>
-  std::vector<T> strToVec(const std::string s)
-  {
-    std::istringstream sstr(s);
-    T                  buf;
-    std::vector<T>     v;
-    
-    while(!sstr.eof())
-    {
-      sstr >> buf;
-      v.push_back(buf);
-    }
-    
-    return v;
-  }
-  
-  // output to streams for vectors
-  template < class T >
-  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
-  {
-    os << "[";
-    for (auto &x: v)
-    {
-      os << x << " ";
-    }
-    if (v.size() > 0)
-    {
-      os << "\b";
-    }
-    os << "]";
-    
-    return os;
-  }
-  
-  // Vector element trait //////////////////////////////////////////////////////  
-  template <typename T>
-  struct element
-  {
-    typedef T type;
-    static constexpr bool is_number = false;
-  };
-  
-  template <typename T>
-  struct element<std::vector<T>>
-  {
-    typedef typename element<T>::type type;
-    static constexpr bool is_number = std::is_arithmetic<T>::value
-                                      or is_complex<T>::value
-                                      or element<T>::is_number;
-  };
-  
-  // Vector flattening utility class ////////////////////////////////////////////
-  // Class to flatten a multidimensional std::vector
-  template <typename V>
-  class Flatten
-  {
-  public:
-    typedef typename element<V>::type Element;
-  public:
-    explicit                     Flatten(const V &vector);
-    const V &                    getVector(void);
-    const std::vector<Element> & getFlatVector(void);
-    const std::vector<size_t>  & getDim(void);
-  private:
-    void accumulate(const Element &e);
-    template <typename W>
-    void accumulate(const W &v);
-    void accumulateDim(const Element &e);
-    template <typename W>
-    void accumulateDim(const W &v);
-  private:
-    const V              &vector_;
-    std::vector<Element> flatVector_;
-    std::vector<size_t>  dim_;
-  };
-  
-  // Class to reconstruct a multidimensional std::vector
-  template <typename V>
-  class Reconstruct
-  {
-  public:
-    typedef typename element<V>::type Element;
-  public:
-    Reconstruct(const std::vector<Element> &flatVector,
-                const std::vector<size_t> &dim);
-    const V &                    getVector(void);
-    const std::vector<Element> & getFlatVector(void);
-    const std::vector<size_t>  & getDim(void);
-  private:
-    void fill(std::vector<Element> &v);
-    template <typename W>
-    void fill(W &v);
-    void resize(std::vector<Element> &v, const unsigned int dim);
-    template <typename W>
-    void resize(W &v, const unsigned int dim);
-  private:
-    V                          vector_;
-    const std::vector<Element> &flatVector_;
-    std::vector<size_t>        dim_;
-    size_t                     ind_{0};
-    unsigned int               dimInd_{0};
-  };
-  
  // Pair IO utilities /////////////////////////////////////////////////////////
  // helper function to parse input in the format "<obj1 obj2>"
  template <typename T1, typename T2>
@ -151,15 +48,15 @@ namespace Grid {
    do
    {
      is.get(c);
-    } while (c != '<' && !is.eof());
-    if (c == '<')
+    } while (c != '(' && !is.eof());
+    if (c == '(')
    {
      int start = is.tellg();
      do
      {
        is.get(c);
-      } while (c != '>' && !is.eof());
-      if (c == '>')
+      } while (c != ')' && !is.eof());
+      if (c == ')')
      {
        int end = is.tellg();
        int psize = end - start - 1;
@ -182,7 +79,7 @@ namespace Grid {
  template <class T1, class T2>
  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
  {
-    os << "<" << p.first << " " << p.second << ">";
+    os << "(" << p.first << " " << p.second << ")";
    return os;
  }

@ -205,6 +102,12 @@ namespace Grid {
    template <typename U>
    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
+    template <typename U>
+    void write(const std::string &s, const iScalar<U> &output);
+    template <typename U, int N>
+    void write(const std::string &s, const iVector<U, N> &output);
+    template <typename U, int N>
+    void write(const std::string &s, const iMatrix<U, N> &output);
  private:
    T *upcast;
  };
@ -224,6 +127,12 @@ namespace Grid {
    template <typename U>
    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
+    template <typename U>
+    void read(const std::string &s, iScalar<U> &output);
+    template <typename U, int N>
+    void read(const std::string &s, iVector<U, N> &output);
+    template <typename U, int N>
+    void read(const std::string &s, iMatrix<U, N> &output);
  protected:
    template <typename U>
    void fromString(U &output, const std::string &s);
@ -237,203 +146,9 @@ namespace Grid {
  };
  template<typename T> struct isWriter {
    static const bool value = false;
-  }; 
-
-
-
-  // Generic writer interface
-  // serializable base class
-  class Serializable
-  {
-  public:
-    template <typename T>
-    static inline void write(Writer<T> &WR,const std::string &s,
-                             const Serializable &obj)
-    {}
-    
-    template <typename T>
-    static inline void read(Reader<T> &RD,const std::string &s,
-                            Serializable &obj)
-    {}
-    
-    friend inline std::ostream & operator<<(std::ostream &os,
-                                            const Serializable &obj)
-    {
-      return os;
-    }
  };
-  
-  // Flatten class template implementation /////////////////////////////////////
-  template <typename V>
-  void Flatten<V>::accumulate(const Element &e)
-  {
-    flatVector_.push_back(e);
-  }
-  
-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulate(const W &v)
-  {
-    for (auto &e: v)
-    {
-      accumulate(e);
-    }
-  }
-  
-  template <typename V>
-  void Flatten<V>::accumulateDim(const Element &e) {};
-  
-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulateDim(const W &v)
-  {
-    dim_.push_back(v.size());
-    accumulateDim(v[0]);
-  }
-  
-  template <typename V>
-  Flatten<V>::Flatten(const V &vector)
-  : vector_(vector)
-  {
-    accumulate(vector_);
-    accumulateDim(vector_);
-  }
-  
-  template <typename V>
-  const V & Flatten<V>::getVector(void)
-  {
-    return vector_;
-  }
-  
-  template <typename V>
-  const std::vector<typename Flatten<V>::Element> &
-  Flatten<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
-  
-  template <typename V>
-  const std::vector<size_t> & Flatten<V>::getDim(void)
-  {
-    return dim_;
-  }
-  
-  // Reconstruct class template implementation /////////////////////////////////
-  template <typename V>
-  void Reconstruct<V>::fill(std::vector<Element> &v)
-  {
-    for (auto &e: v)
-    {
-      e = flatVector_[ind_++];
-    }
-  }
-  
-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::fill(W &v)
-  {
-    for (auto &e: v)
-    {
-      fill(e);
-    }
-  }
-  
-  template <typename V>
-  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-  }
-  
-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::resize(W &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-    for (auto &e: v)
-    {
-      resize(e, dim + 1);
-    }
-  }
-  
-  template <typename V>
-  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
-                              const std::vector<size_t> &dim)
-  : flatVector_(flatVector)
-  , dim_(dim)
-  {
-    resize(vector_, 0);
-    fill(vector_);
-  }
-  
-  template <typename V>
-  const V & Reconstruct<V>::getVector(void)
-  {
-    return vector_;
-  }
-  
-  template <typename V>
-  const std::vector<typename Reconstruct<V>::Element> &
-  Reconstruct<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
-  
-  template <typename V>
-  const std::vector<size_t> & Reconstruct<V>::getDim(void)
-  {
-    return dim_;
-  }
-  
-  // Generic writer interface //////////////////////////////////////////////////
-  template <typename T>
-  inline void push(Writer<T> &w, const std::string &s) {
-    w.push(s);
-  }
-  
-  template <typename T>
-  inline void push(Writer<T> &w, const char *s)
-  {
-    w.push(std::string(s));
-  }
-  
-  template <typename T>
-  inline void pop(Writer<T> &w)
-  {
-    w.pop();
-  }
-  
-  template <typename T, typename U>
-  inline void write(Writer<T> &w, const std::string& s, const U &output)
-  {
-    w.write(s, output);
-  }
-  
-  // Generic reader interface
-  template <typename T>
-  inline bool push(Reader<T> &r, const std::string &s)
-  {
-    return r.push(s);
-  }
-  
-  template <typename T>
-  inline bool push(Reader<T> &r, const char *s)
-  {
-    return r.push(std::string(s));
-  }
-  
-  template <typename T>
-  inline void pop(Reader<T> &r)
-  {
-    r.pop();
-  }
-  
-  template <typename T, typename U>
-  inline void read(Reader<T> &r, const std::string &s, U &output)
-  {
-    r.read(s, output);
-  }
-  
-  // Writer template implementation ////////////////////////////////////////////
+
+  // Writer template implementation
  template <typename T>
  Writer<T>::Writer(void)
  {
@ -467,6 +182,27 @@ namespace Grid {
  {
    upcast->writeDefault(s, output);
  }
+
+  template <typename T>
+  template <typename U>
+  void Writer<T>::write(const std::string &s, const iScalar<U> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Writer<T>::write(const std::string &s, const iVector<U, N> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
  
  // Reader template implementation
  template <typename T>
@ -502,7 +238,37 @@ namespace Grid {
  {
    upcast->readDefault(s, output);
  }
+
+  template <typename T>
+  template <typename U>
+  void Reader<T>::read(const std::string &s, iScalar<U> &output)
+  {
+    typename TensorToVec<iScalar<U>>::type v;
+
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Reader<T>::read(const std::string &s, iVector<U, N> &output)
+  {
+    typename TensorToVec<iVector<U, N>>::type v;
+    
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
  
+  template <typename T>
+  template <typename U, int N>
+  void Reader<T>::read(const std::string &s, iMatrix<U, N> &output)
+  {
+    typename TensorToVec<iMatrix<U, N>>::type v;
+    
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
+
  template <typename T>
  template <typename U>
  void Reader<T>::fromString(U &output, const std::string &s)
@ -521,6 +287,76 @@ namespace Grid {
      abort();
    }
  }
+
+  // serializable base class ///////////////////////////////////////////////////
+  class Serializable
+  {
+  public:
+    template <typename T>
+    static inline void write(Writer<T> &WR,const std::string &s,
+                             const Serializable &obj)
+    {}
+    
+    template <typename T>
+    static inline void read(Reader<T> &RD,const std::string &s,
+                            Serializable &obj)
+    {}
+    
+    friend inline std::ostream & operator<<(std::ostream &os,
+                                            const Serializable &obj)
+    {
+      return os;
+    }
+  };
+  
+  // Generic writer interface //////////////////////////////////////////////////
+  template <typename T>
+  inline void push(Writer<T> &w, const std::string &s) {
+    w.push(s);
+  }
+  
+  template <typename T>
+  inline void push(Writer<T> &w, const char *s)
+  {
+    w.push(std::string(s));
+  }
+  
+  template <typename T>
+  inline void pop(Writer<T> &w)
+  {
+    w.pop();
+  }
+  
+  template <typename T, typename U>
+  inline void write(Writer<T> &w, const std::string& s, const U &output)
+  {
+    w.write(s, output);
+  }
+  
+  // Generic reader interface //////////////////////////////////////////////////
+  template <typename T>
+  inline bool push(Reader<T> &r, const std::string &s)
+  {
+    return r.push(s);
+  }
+  
+  template <typename T>
+  inline bool push(Reader<T> &r, const char *s)
+  {
+    return r.push(std::string(s));
+  }
+  
+  template <typename T>
+  inline void pop(Reader<T> &r)
+  {
+    r.pop();
+  }
+  
+  template <typename T, typename U>
+  inline void read(Reader<T> &r, const std::string &s, U &output)
+  {
+    r.read(s, output);
+  }
 }

 #endif
--- a/lib/serialisation/Hdf5IO.h
+++ b/lib/serialisation/Hdf5IO.h
@ -5,6 +5,7 @@
 #include <string>
 #include <vector>
 #include <H5Cpp.h>
+#include <Grid/tensors/Tensors.h>
 #include "Hdf5Type.h"

 #ifndef H5_NO_NAMESPACE
--- a/lib/serialisation/VectorUtils.h
+++ b/lib/serialisation/VectorUtils.h
@ -0,0 +1,336 @@
+#ifndef GRID_SERIALISATION_VECTORUTILS_H
+#define GRID_SERIALISATION_VECTORUTILS_H
+
+#include <type_traits>
+#include <Grid/tensors/Tensors.h>
+
+namespace Grid {
+  // Grid scalar tensors to nested std::vectors //////////////////////////////////
+  template <typename T>
+  struct TensorToVec
+  {
+    typedef T type;
+  };
+
+  template <typename T>
+  struct TensorToVec<iScalar<T>>
+  {
+    typedef typename TensorToVec<T>::type type;
+  };
+
+  template <typename T, int N>
+  struct TensorToVec<iVector<T, N>>
+  {
+    typedef typename std::vector<typename TensorToVec<T>::type> type;
+  };
+
+  template <typename T, int N>
+  struct TensorToVec<iMatrix<T, N>>
+  {
+    typedef typename std::vector<std::vector<typename TensorToVec<T>::type>> type;
+  };
+
+  template <typename T>
+  typename TensorToVec<T>::type tensorToVec(const T &t)
+  {
+    return t;
+  }
+
+  template <typename T>
+  typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
+  {
+    return tensorToVec(t._internal);
+  }
+
+  // Convert an iVector<T, N> into a std::vector holding its N converted components.
+  template <typename T, int N>
+  typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
+  {
+    using Vec = typename TensorToVec<iVector<T, N>>::type;
+
+    Vec out(N);
+    for (int idx = 0; idx < N; ++idx)
+    {
+      out[idx] = tensorToVec(t._internal[idx]);
+    }
+    return out;
+  }
+
+  // Convert an iMatrix<T, N> into an N x N nested std::vector of converted entries,
+  // indexed as result[row][col] to mirror _internal[row][col].
+  template <typename T, int N>
+  typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
+  {
+    using Mat = typename TensorToVec<iMatrix<T, N>>::type;
+    using Row = typename Mat::value_type;
+
+    Mat out(N, Row(N));
+    for (int row = 0; row < N; ++row)
+    for (int col = 0; col < N; ++col)
+    {
+      out[row][col] = tensorToVec(t._internal[row][col]);
+    }
+
+    return out;
+  }
+
+  template <typename T>
+  void vecToTensor(T &t, const typename TensorToVec<T>::type &v)
+  {
+    t = v;
+  }
+
+
+  template <typename T>
+  void vecToTensor(iScalar<T> &t, const typename TensorToVec<iScalar<T>>::type &v)
+  {
+    vecToTensor(t._internal, v);
+  }
+
+  // Fill an iVector<T, N> component by component from v.
+  // v is indexed 0..N-1 directly; its size is not checked here.
+  template <typename T, int N>
+  void vecToTensor(iVector<T, N> &t, const typename TensorToVec<iVector<T, N>>::type &v)
+  {
+    for (int idx = 0; idx < N; ++idx)
+      vecToTensor(t._internal[idx], v[idx]);
+  }
+
+  // Fill an iMatrix<T, N> from a nested vector read as v[row][col]; sizes are not checked.
+  template <typename T, int N>
+  void vecToTensor(iMatrix<T, N> &t, const typename TensorToVec<iMatrix<T, N>>::type &v)
+  {
+    for (int row = 0; row < N; ++row)
+    {
+      for (int col = 0; col < N; ++col) vecToTensor(t._internal[row][col], v[row][col]);
+    }
+  }
+
+  // Vector element trait //////////////////////////////////////////////////////  
+  template <typename T>
+  struct element
+  {
+    typedef T type;
+    static constexpr bool is_number = false;
+  };
+  
+  template <typename T>
+  struct element<std::vector<T>>
+  {
+    typedef typename element<T>::type type;
+    static constexpr bool is_number = std::is_arithmetic<T>::value
+                                      or is_complex<T>::value
+                                      or element<T>::is_number;
+  };
+  
+  // Vector flattening utility class ////////////////////////////////////////////
+  // Class to flatten a multidimensional std::vector
+  template <typename V>
+  class Flatten
+  {
+  public:
+    typedef typename element<V>::type Element; // innermost (non-vector) element type of V
+  public:
+    explicit                     Flatten(const V &vector); // flattens immediately: fills the flat vector and the dimensions
+    const V &                    getVector(void);          // the original nested vector passed to the constructor
+    const std::vector<Element> & getFlatVector(void);      // all leaf elements, in nested iteration order
+    const std::vector<size_t>  & getDim(void);             // size of each nesting level, outermost first
+  private:
+    void accumulate(const Element &e);    // leaf case: append one element to flatVector_
+    template <typename W>
+    void accumulate(const W &v);          // container case: recurse over every element of v
+    void accumulateDim(const Element &e); // leaf case: nothing to record
+    template <typename W>
+    void accumulateDim(const W &v);       // container case: record v.size(), then descend into v[0] only
+  private:
+    const V              &vector_;     // reference to the caller's vector (not a copy): it must outlive this object
+    std::vector<Element> flatVector_;  // flattened copy of the leaf elements
+    std::vector<size_t>  dim_;         // per-level sizes, measured along the first element of each level
+  };
+  
+  // Class to reconstruct a multidimensional std::vector
+  template <typename V>
+  class Reconstruct
+  {
+  public:
+    typedef typename element<V>::type Element; // innermost (non-vector) element type of V
+  public:
+    Reconstruct(const std::vector<Element> &flatVector,
+                const std::vector<size_t> &dim);           // rebuilds immediately: resizes vector_ per dim, then fills it
+    const V &                    getVector(void);          // the rebuilt nested vector
+    const std::vector<Element> & getFlatVector(void);      // the flat input passed to the constructor
+    const std::vector<size_t>  & getDim(void);             // the per-level sizes passed to the constructor
+  private:
+    void fill(std::vector<Element> &v);   // innermost case: copy consecutive elements out of flatVector_
+    template <typename W>
+    void fill(W &v);                      // container case: recurse over every element of v
+    void resize(std::vector<Element> &v, const unsigned int dim); // innermost case: resize to dim_[dim]
+    template <typename W>
+    void resize(W &v, const unsigned int dim);                    // container case: resize, then recurse with dim + 1
+  private:
+    V                          vector_;      // nested vector being rebuilt (owned)
+    const std::vector<Element> &flatVector_; // reference to the caller's flat data (not a copy): it must outlive this object
+    std::vector<size_t>        dim_;         // copy of the requested per-level sizes
+    size_t                     ind_{0};      // next position in flatVector_ to be consumed by fill()
+    unsigned int               dimInd_{0};   // not referenced by any member function defined in this header
+  };
+
+  // Flatten class template implementation
+  template <typename V>
+  void Flatten<V>::accumulate(const Element &e)
+  {
+    flatVector_.push_back(e);
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Flatten<V>::accumulate(const W &v)
+  {
+    for (auto &e: v)
+    {
+      accumulate(e);
+    }
+  }
+  
+  template <typename V>
+  void Flatten<V>::accumulateDim(const Element &e) {};
+  
+  // Record this level's size, then descend into the first element (if any) for deeper levels.
+  template <typename V> template <typename W>
+  void Flatten<V>::accumulateDim(const W &v)
+  {
+    dim_.push_back(v.size());
+    if (!v.empty()) accumulateDim(v[0]); // v[0] on an empty vector is undefined behaviour
+  }
+  
+  template <typename V>
+  Flatten<V>::Flatten(const V &vector)
+  : vector_(vector)
+  {
+    accumulate(vector_);
+    accumulateDim(vector_);
+  }
+  
+  template <typename V>
+  const V & Flatten<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  template <typename V>
+  const std::vector<typename Flatten<V>::Element> &
+  Flatten<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  template <typename V>
+  const std::vector<size_t> & Flatten<V>::getDim(void)
+  {
+    return dim_;
+  }
+  
+  // Reconstruct class template implementation
+  template <typename V>
+  void Reconstruct<V>::fill(std::vector<Element> &v)
+  {
+    for (auto &e: v)
+    {
+      e = flatVector_[ind_++];
+    }
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::fill(W &v)
+  {
+    for (auto &e: v)
+    {
+      fill(e);
+    }
+  }
+  
+  template <typename V>
+  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
+  {
+    v.resize(dim_[dim]);
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::resize(W &v, const unsigned int dim)
+  {
+    v.resize(dim_[dim]);
+    for (auto &e: v)
+    {
+      resize(e, dim + 1);
+    }
+  }
+  
+  template <typename V>
+  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
+                              const std::vector<size_t> &dim)
+  : flatVector_(flatVector)
+  , dim_(dim)
+  {
+    resize(vector_, 0);
+    fill(vector_);
+  }
+  
+  template <typename V>
+  const V & Reconstruct<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  template <typename V>
+  const std::vector<typename Reconstruct<V>::Element> &
+  Reconstruct<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  template <typename V>
+  const std::vector<size_t> & Reconstruct<V>::getDim(void)
+  {
+    return dim_;
+  }
+
+  // Vector IO utilities ///////////////////////////////////////////////////////
+  // helper function to read whitespace-separated values from s.
+  // Parsing stops at the first token that cannot be read as a T; only
+  // successfully extracted values are returned (empty input gives an empty vector).
+  template <typename T>
+  std::vector<T> strToVec(const std::string s)
+  {
+    std::istringstream sstr(s);
+    T                  buf;
+    std::vector<T>     v;
+    
+    // Test the extraction itself: looping on !eof() appended a bogus value after
+    // trailing whitespace (or for empty input) and never ended on an unparsable token.
+    while (sstr >> buf) v.push_back(buf);
+    
+    return v;
+  }
+  
+  // output to streams for vectors, formatted as "[a b c]" ("[]" when empty).
+  // Elements are separated by a single space; no trailing space or backspace
+  // character is emitted, so the text stays clean when written to a file.
+  template < class T >
+  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+  {
+    os << "[";
+    for (size_t i = 0; i < v.size(); ++i)
+    {
+      // the separator goes before every element except the first
+      if (i > 0) os << " ";
+      os << v[i];
+    }
+    os << "]";
+    
+    return os;
+  }
+}
+
+#endif
--- a/lib/tensors/Tensor_logical.h
+++ b/lib/tensors/Tensor_logical.h
@ -55,5 +55,38 @@ LOGICAL_BINOP(&);
 LOGICAL_BINOP(||);
 LOGICAL_BINOP(&&);

+// Equality for iScalar: compares the wrapped _internal values.
+template <class T>
+strong_inline bool operator==(const iScalar<T> &t1, const iScalar<T> &t2)
+{
+  const bool same = (t1._internal == t2._internal);
+
+  return same;
+}
+
+// Equality for iVector: true only when all N components compare equal.
+// Components after the first mismatch are not compared.
+template <class T, int N>
+strong_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
+{
+  for (int i = 0; i < N; ++i)
+  {
+    if (!(t1._internal[i] == t2._internal[i]))
+    {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Equality for iMatrix: true only when all N x N entries compare equal.
+// Entries are visited row by row; those after the first mismatch are not
+// compared.
+template <class T, int N>
+strong_inline bool operator==(const iMatrix<T, N> &t1, const iMatrix<T, N> &t2)
+{
+  for (int row = 0; row < N; ++row)
+  {
+    for (int col = 0; col < N; ++col)
+    {
+      if (!(t1._internal[row][col] == t2._internal[row][col]))
+      {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 }
 #endif
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@ -49,6 +49,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid.h>

 #include <Grid/util/CompilerCompatible.h>
+#include <version.h>


 #include <fenv.h>
@ -288,6 +289,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
    std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
    std::cout << "GNU General Public License for more details."<<std::endl;
+    printHash();
    std::cout << std::endl;
  }

--- a/lib/util/Init.h
+++ b/lib/util/Init.h
@ -61,6 +61,7 @@ namespace Grid {
 		       std::vector<int> &simd,
 		       std::vector<int> &mpi);

+  void printHash(void);

 };
 #endif
--- a/lib/util/version.cc
+++ b/lib/util/version.cc
@ -0,0 +1,12 @@
+#include <iostream>
+#include <version.h>
+namespace Grid {
+  // Print the git commit hash recorded in GITHASH to std::cout. GITHASH is
+  // expected to be defined by the generated version.h; if it is not, a hint to
+  // check the makefile is printed instead.
+  void printHash(void)
+  {
+#ifdef GITHASH
+    std::cout << "Current Grid git commit hash=" << GITHASH;
+#else
+    std::cout << "Current Grid git commit hash is undefined. Check makefile.";
+#endif
+#undef GITHASH
+    std::cout << std::endl;
+  }
+}
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@ -45,7 +45,8 @@ public:
                          bool , b,
                          std::vector<double>, array,
                          std::vector<std::vector<double> >, twodimarray,
-                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray
+                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray,
+                          SpinColourMatrix, scm
                          );
  myclass() {}
  myclass(int i)
@ -59,6 +60,12 @@ public:
    y=2*i;
    b=true;
    name="bother said pooh";
+    scm()(0, 1)(2, 1) = 2.356;
+    scm()(3, 0)(1, 1) = 1.323;
+    scm()(2, 1)(0, 1) = 5.3336;
+    scm()(0, 2)(1, 1) = 6.336;
+    scm()(2, 1)(2, 2) = 7.344;
+    scm()(1, 1)(2, 0) = 8.3534;
  }
 };

@ -93,8 +100,30 @@ void ioTest(const std::string &filename, const O &object, const std::string &nam
  if (!good) exit(EXIT_FAILURE);
 }

+// Round-trip check of the tensor <-> vector conversion for tensor type T.
+// A tensor is filled with random numbers drawn from rng, converted with
+// tensorToVec, converted back with vecToTensor, and the squared norm of the
+// difference is printed next to label. The check succeeds only if that norm
+// is exactly zero. As in ioTest, a failure terminates the program with
+// EXIT_FAILURE so that the test binary reports it through its exit status.
+template <typename T>
+void tensorConvTestFn(GridSerialRNG &rng, const std::string label)
+{
+  T    t, ft;
+  Real n;
+  bool good;
+
+  random(rng, t);
+  auto tv = tensorToVec(t);
+  vecToTensor(ft, tv);
+  n    = norm2(t - ft);
+  good = (n == 0);
+  std::cout << label << " norm 2 diff: " << n << " -- " 
+            << (good ? "success" : "failure") << std::endl;
+  if (!good) exit(EXIT_FAILURE);
+}
+
+// Runs tensorConvTestFn for the given type, using the type name as the label.
+#define tensorConvTest(rng, type) tensorConvTestFn<type>(rng, #type)
+
 int main(int argc,char **argv)
 {
+  GridSerialRNG    rng;
+
+  rng.SeedFixedIntegers(std::vector<int>({42,10,81,9}));
+  
  std::cout << "==== basic IO" << std::endl;
  XmlWriter WR("bother.xml");

@ -120,7 +149,7 @@ int main(int argc,char **argv)
  std::cout << "-- serialisable class writing to 'bother.xml'..." << std::endl;
  write(WR,"obj",obj);
  WR.write("obj2", obj);
-  vec.push_back(myclass(1234));
+  vec.push_back(obj);
  vec.push_back(myclass(5678));
  vec.push_back(myclass(3838));
  pair = std::make_pair(myenum::red, myenum::blue);
@ -131,8 +160,6 @@ int main(int argc,char **argv)
  std::cout << "-- serialisable class comparison:" << std::endl;
  std::cout << "vec[0] == obj: " << ((vec[0] == obj) ? "true" : "false") << std::endl;
  std::cout << "vec[1] == obj: " << ((vec[1] == obj) ? "true" : "false") << std::endl;
-
-  write(WR, "objpair", pair);
  std::cout << "-- pair writing to std::cout:" << std::endl;
  std::cout << pair << std::endl;

@ -141,26 +168,20 @@ int main(int argc,char **argv)
  //// XML
  ioTest<XmlWriter, XmlReader>("iotest.xml", obj, "XML    (object)           ");
  ioTest<XmlWriter, XmlReader>("iotest.xml", vec, "XML    (vector of objects)");
-  ioTest<XmlWriter, XmlReader>("iotest.xml", pair, "XML    (pair of objects)");
  //// binary
  ioTest<BinaryWriter, BinaryReader>("iotest.bin", obj, "binary (object)           ");
  ioTest<BinaryWriter, BinaryReader>("iotest.bin", vec, "binary (vector of objects)");
-  ioTest<BinaryWriter, BinaryReader>("iotest.bin", pair, "binary (pair of objects)");
  //// text
  ioTest<TextWriter, TextReader>("iotest.dat", obj, "text   (object)           ");
  ioTest<TextWriter, TextReader>("iotest.dat", vec, "text   (vector of objects)");
-  ioTest<TextWriter, TextReader>("iotest.dat", pair, "text   (pair of objects)");
  //// text
  ioTest<JSONWriter, JSONReader>("iotest.json", obj,  "JSON   (object)           ");
  ioTest<JSONWriter, JSONReader>("iotest.json", vec,  "JSON   (vector of objects)");
-  ioTest<JSONWriter, JSONReader>("iotest.json", pair, "JSON   (pair of objects)");

  //// HDF5
-#undef HAVE_HDF5
 #ifdef HAVE_HDF5
  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", obj, "HDF5   (object)           ");
  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", vec, "HDF5   (vector of objects)");
-  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", pair, "HDF5   (pair of objects)");
 #endif

  std::cout << "\n==== vector flattening/reconstruction" << std::endl;
@ -197,68 +218,11 @@ int main(int argc,char **argv)
  std::cout << flatdv.getVector() << std::endl;
  std::cout << std::endl;

-
-  std::cout << ".:::::: Testing JSON classes "<< std::endl;
-
-
-  {
-    JSONWriter JW("bother.json");
-
-    // test basic type writing
-    myenum a = myenum::red;
-    push(JW,"BasicTypes");
-    write(JW,std::string("i16"),i16);
-    write(JW,"myenum",a);
-    write(JW,"u16",u16);
-    write(JW,"i32",i32);
-    write(JW,"u32",u32);
-    write(JW,"i64",i64);
-    write(JW,"u64",u64);
-    write(JW,"f",f);
-    write(JW,"d",d);
-    write(JW,"b",b);
-    pop(JW);
-
-
-    // test serializable class writing
-    myclass obj(1234); // non-trivial constructor
-    std::cout << obj << std::endl;
-    std::cout << "-- serialisable class writing to 'bother.json'..." << std::endl;
-    write(JW,"obj",obj);
-    JW.write("obj2", obj);
-
-
-    std::vector<myclass> vec;
-    vec.push_back(myclass(1234));
-    vec.push_back(myclass(5678));
-    vec.push_back(myclass(3838));
-    write(JW, "objvec", vec);
-
-  }
-
-
-  {
-    JSONReader RD("bother.json");
-    myclass jcopy1;
-    std::vector<myclass> jveccopy1;
-    read(RD,"obj",jcopy1);
-    read(RD,"objvec", jveccopy1);
-    std::cout << "Loaded (JSON) -----------------" << std::endl;
-    std::cout << jcopy1 << std::endl << jveccopy1 << std::endl;
-  }
- 
-
-/*
-  // This is still work in progress
-  {
-    // Testing the next element function
-    JSONReader RD("test.json");
-    RD.push("grid");
-    RD.push("Observable");
-    std::string name;
-    read(RD,"name", name);
-  }
-*/
-
-
+  std::cout << "==== Grid tensor to vector test" << std::endl;
+  tensorConvTest(rng, SpinColourMatrix);
+  tensorConvTest(rng, SpinColourVector);
+  tensorConvTest(rng, ColourMatrix);
+  tensorConvTest(rng, ColourVector);
+  tensorConvTest(rng, SpinMatrix);
+  tensorConvTest(rng, SpinVector);
 }
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@ -103,6 +103,33 @@ int main (int argc, char ** argv)

  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;

+  #ifdef HAVE_LIME
+  // With --checksums on the command line, write both solutions to disk and
+  // print the SciDAC checksums returned by the writer so that the mixed
+  // precision and regular CG results can be compared.
+  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
+  
+  std::string file1("./Propagator1");
+  std::string file2("./Propagator2");
+  uint32_t nersc_csum;
+  uint32_t scidac_csuma;
+  uint32_t scidac_csumb;
+  typedef SpinColourVectorD   FermionD;
+  typedef vSpinColourVectorD vFermionD;
+
+  BinarySimpleMunger<FermionD,FermionD> munge;
+  std::string format = getFormatString<vFermionD>();
+  
+  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
+						   nersc_csum,scidac_csuma,scidac_csumb);
+
+  // std::hex is sticky, so std::cout is switched back to decimal afterwards
+  std::cout << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::dec<<std::endl;
+
+  // The second solution goes to its own file; writing it to file1 again
+  // overwrote the first propagator and left file2 unused.
+  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file2,munge, 0, format,
+						   nersc_csum,scidac_csuma,scidac_csumb);
+
+  std::cout << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::dec<<std::endl;
+  }
+  #endif
+
+
  Grid_finalize();
 }
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@ -393,7 +393,6 @@ int main(int argc, char **argv) {
      }
      random(Foo);
      */
-      lex_sites(Foo);

      Integer mm[4];
      mm[0] = 1;
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@ -111,6 +111,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<"Error "<<norm2(err)<<std::endl;

  const int nbasis = 2;
+  const int cb = 0 ;
  LatticeFermion prom(FGrid);

  std::vector<LatticeFermion> subspace(nbasis,FGrid);
@ -119,7 +120,7 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
  Aggregates.CreateSubspaceRandom(RNG5);

  subspace=Aggregates.subspace;
--- a/tests/debug/Test_cayley_ldop_cr.cc
+++ b/tests/debug/Test_cayley_ldop_cr.cc
@ -78,6 +78,7 @@ int main (int argc, char ** argv)

  RealD mass=0.1;
  RealD M5=1.5;
+  int cb=0;

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
@ -95,7 +96,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
  Aggregates.CreateSubspace(RNG5,HermDefOp);


--- a/tests/forces/Test_gp_plaq_force.cc
+++ b/tests/forces/Test_gp_plaq_force.cc
@ -0,0 +1,123 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/forces/Test_gp_plaq_force.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Finite-difference check of the force of ConjugateWilsonGaugeActionR.
+// The action is evaluated on a random gauge field U and on a field Uprime
+// obtained from U by a small step dt along a random momentum. The measured
+// change Sprime - S is compared with the change predicted from Action.deriv;
+// the real parts must agree to within 1.0e-2.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  // NOTE(review): RBGrid is not referenced again in this function
+  GridRedBlackCartesian     RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  // NOTE(review): seeds is not used; pRNG is seeded with its own list below
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG          pRNG(&Grid);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+  LatticeGaugeField U(&Grid);
+
+  SU3::HotConfiguration(pRNG,U);
+  
+  double beta = 1.0;
+  // c1 is only referenced by the commented-out rectangle action below
+  double c1   = 0.331;
+
+  //ConjugatePlaqPlusRectangleActionR Action(beta,c1);
+  ConjugateWilsonGaugeActionR Action(beta);
+  //WilsonGaugeActionR Action(beta);
+
+  // action on the unperturbed field
+  ComplexD S    = Action.S(U);
+
+  // derivative of the action with respect to U, written into UdSdU by Action.deriv
+  LatticeGaugeField UdSdU(&Grid);
+
+  Action.deriv(U,UdSdU);
+
+  ////////////////////////////////////
+  // Modify the gauge field a little
+  ////////////////////////////////////
+  RealD dt = 0.0001;
+
+  LatticeColourMatrix mommu(&Grid); 
+  // NOTE(review): forcemu is not used in this function
+  LatticeColourMatrix forcemu(&Grid); 
+  LatticeGaugeField mom(&Grid); 
+  LatticeGaugeField Uprime(&Grid); 
+
+  for(int mu=0;mu<Nd;mu++){
+
+    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+
+    PokeIndex<LorentzIndex>(mom,mommu,mu);
+
+    // first-order approximation of exp(p dt) U: Uprime = U + p U dt
+    parallel_for(auto i=mom.begin();i<mom.end();i++){ // exp(pmu dt) * Umu
+      Uprime[i](mu) = U[i](mu) + mom[i](mu)*U[i](mu)*dt ;
+    }
+  }
+
+  // action on the perturbed field
+  ComplexD Sprime    = Action.S(Uprime);
+
+  //////////////////////////////////////////////
+  // Use derivative to estimate dS
+  //////////////////////////////////////////////
+
+  LatticeComplex dS(&Grid); dS = zero;
+
+  for(int mu=0;mu<Nd;mu++){
+
+    auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
+         mommu   = PeekIndex<LorentzIndex>(mom,mu);
+
+    // Update gauge action density
+    // U = exp(p dt) U
+    // dU/dt = p U
+    // so dSdt = trace( dUdt dSdU) = trace( p UdSdUmu )
+
+    dS = dS - trace(mommu*UdSdUmu)*dt*2.0;
+
+  }
+  // predicted change of the action, summed over the lattice
+  ComplexD dSpred    = sum(dS);
+
+  std::cout << GridLogMessage << " S      "<<S<<std::endl;
+  std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
+  std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
+  std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl;
+  // measured and predicted change must agree in the real part to 1.0e-2;
+  // note that assert compiles to nothing when NDEBUG is defined
+  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
+  std::cout<< GridLogMessage << "Done" <<std::endl;
+  Grid_finalize();
+}
--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@ -59,8 +59,8 @@ int main (int argc, char ** argv)
  double beta = 1.0;
  double c1   = 0.331;

-  //GparityPlaqPlusRectangleActionR Action(beta,c1);
-  ConjugateWilsonGaugeActionR Action(beta);
+  ConjugatePlaqPlusRectangleActionR Action(beta,c1);
+  //  ConjugateWilsonGaugeActionR Action(beta);
  //WilsonGaugeActionR Action(beta);

  ComplexD S    = Action.S(U);
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@ -91,7 +91,7 @@ int main (int argc, char ** argv)
  ////////////////////////////////////
  // Modify the gauge field a little 
  ////////////////////////////////////
-  RealD dt = 0.0001;
+  RealD dt = 0.01;

  LatticeColourMatrix mommu(UGrid); 
  LatticeColourMatrix forcemu(UGrid); 
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@ -56,12 +56,12 @@ public:

  void checkpointFine(std::string evecs_file,std::string evals_file)
  {
-    assert(this->_Aggregate.subspace.size()==nbasis);
+    assert(this->subspace.size()==nbasis);
    emptyUserRecord record;
    Grid::QCD::ScidacWriter WR;
    WR.open(evecs_file);
    for(int k=0;k<nbasis;k++) {
-      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      WR.writeScidacFieldRecord(this->subspace[k],record);
    }
    WR.close();
    
@ -72,7 +72,7 @@ public:
  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
  {
    this->evals_fine.resize(nbasis);
-    this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
+    this->subspace.resize(nbasis,this->_FineGrid);
    
    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
    XmlReader RDx(evals_file);
@ -85,8 +85,8 @@ public:
    Grid::QCD::ScidacReader RD ;
    RD.open(evecs_file);
    for(int k=0;k<nbasis;k++) {
-      this->_Aggregate.subspace[k].checkerboard=this->_checkerboard;
-      RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
      
    }
    RD.close();
@ -180,7 +180,6 @@ int main (int argc, char ** argv) {
  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
-  GridRedBlackCartesian * CoarseGrid5rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid5);

  // Gauge field
  LatticeGaugeField Umu(UGrid);
@ -206,7 +205,7 @@ int main (int argc, char ** argv) {

  const int nbasis= 60;
  assert(nbasis==Ns1);
-  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;

  assert( (Params.doFine)||(Params.doFineRead));
@ -221,7 +220,9 @@ int main (int argc, char ** argv) {
    std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
    _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
    _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
  }

  if ( Params.doFineRead ) { 
@ -231,8 +232,6 @@ int main (int argc, char ** argv) {
  }

  if ( Params.doCoarse ) {
-    std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl;
-    
    std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
    _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
 			      coarse.Nstop, coarse.Nk,coarse.Nm,