Merge branch 'develop' of https://github.com/paboyle/Grid into merge

2026-06-18 09:53:43 +01:00 · 2018-03-07 15:24:11 -05:00
parent ebb1bebf24 a7d19dbb64
commit 0b63e2e9cd
175 changed files with 12512 additions and 4800 deletions
@@ -1,28 +1,18 @@
 extra_sources=
 extra_headers=
-if BUILD_COMMS_MPI
-  extra_sources+=communicator/Communicator_mpi.cc
-  extra_sources+=communicator/Communicator_base.cc
-endif

 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
-endif
-
-if BUILD_COMMS_MPIT
-  extra_sources+=communicator/Communicator_mpit.cc
-  extra_sources+=communicator/Communicator_base.cc
-endif
-
-if BUILD_COMMS_SHMEM
-  extra_sources+=communicator/Communicator_shmem.cc
-  extra_sources+=communicator/Communicator_base.cc
+  extra_sources+=communicator/SharedMemoryMPI.cc
+  extra_sources+=communicator/SharedMemory.cc
 endif

 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
+  extra_sources+=communicator/SharedMemoryNone.cc
+  extra_sources+=communicator/SharedMemory.cc
 endif

 if BUILD_HDF5
@@ -39,10 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>

-#include <Grid/algorithms/densematrix/DenseMatrix.h>
-#include <Grid/algorithms/densematrix/Francis.h>
-#include <Grid/algorithms/densematrix/Householder.h>
-
+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
@@ -183,11 +183,13 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) =0;
      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
-	Field tmp(in._grid);
+      Field tmp(in._grid);
+      tmp.checkerboard = in.checkerboard;
 	ni=Mpc(in,tmp);
 	no=MpcDag(tmp,out);
      }
      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+      out.checkerboard = in.checkerboard;
 	MpcDagMpc(in,out,n1,n2);
      }
      virtual void HermOp(const Field &in, Field &out){
@@ -215,13 +217,15 @@ namespace Grid {
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in._grid);
-//	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
+      Field tmp(in._grid);
+      tmp.checkerboard = !in.checkerboard;
+	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);

+      //std::cout << "cb in " << in.checkerboard << "  cb out " << out.checkerboard << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
      }
@@ -0,0 +1,101 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Initial-guess functor that ignores the source and overwrites the guess with zero.
+// Uses the same `zero` constant as the other guessers in this header.
+struct ZeroGuesser {
+public:
+  // src   : right-hand side of the solve (unused)
+  // guess : field to overwrite
+  template<class Field> 
+  void operator()(const Field &src,Field &guess) { guess = zero; };
+};
+// Initial-guess functor that seeds the guess with a copy of the source.
+struct SourceGuesser {
+  // src   : value copied into the guess
+  // guess : overwritten with src
+  template<class Field>
+  void operator()(const Field &src, Field &guess)
+  {
+    guess = src;
+  }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+// Guess functor built from stored eigenvectors and eigenvalues.
+// For every pair it calls axpy with coefficient <evec[i],src> / eval[i], accumulating
+// into `guess` (presumably axpy(r,a,x,y) forms r = a*x + y -- confirm against axpy).
+// Both vectors are held by reference, so the caller must keep them alive.
+template<class Field>
+struct DeflatedGuesser {
+private:
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+
+public:
+
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+
+  // src   : field the inner products are taken against
+  // guess : overwritten; zeroed first, then accumulated into
+  void operator()(const Field &src,Field &guess) { 
+    guess = zero;
+    assert(evec.size()==eval.size());
+    // Signed count, as in LocalCoherenceDeflatedGuesser, so the loop does not compare int with size_t.
+    const int N = (int)evec.size();
+    for (int i=0;i<N;i++) {
+      const Field& tmp = evec[i];
+      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
+    }
+  }
+};
+
+// Guess functor that deflates on the coarse grid: the fine source is block-projected
+// onto `subspace`, combined with the coarse eigenvectors/eigenvalues, and the coarse
+// result is block-promoted into the fine guess.
+// All three vectors are held by reference, so the caller must keep them alive.
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), 
+      evec_coarse(_evec_coarse), 
+      eval_coarse(_eval_coarse)  
+  {
+  }
+  
+  // src   : fine-grid field that is projected to the coarse grid
+  // guess : fine-grid field receiving the promoted coarse result
+  void operator()(const FineField &src,FineField &guess) { 
+    // evec_coarse[0] supplies the coarse grid and eval_coarse[i] is read for every
+    // coarse vector, so reject inputs that would be indexed out of range.
+    assert(evec_coarse.size()>0);
+    assert(eval_coarse.size()>=evec_coarse.size());
+    int N = (int)evec_coarse.size();
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
+    blockProject(src_coarse,src,subspace);    
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i];
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+  };
+};
+
+
+
+}
+#endif
@@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
  basisReorderInPlace(_v,sort_vals,idx);
 }

-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = zero;
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -181,6 +168,7 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
+
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
  /////////////////////////
  
 public:       
+
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
+
 namespace Grid { 
+
+
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@@ -70,21 +73,24 @@ public:
  typedef Lattice<Fobj>          FineField;

  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
-    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace)
+  {  
+    assert(subspace.size() >0);
+  };

  void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+      
+    FineField fin (FineGrid);     fin.checkerboard= checkerboard;
+    FineField fout(FineGrid);   fout.checkerboard = checkerboard;

-    GridBase *FineGrid = _Aggregate.FineGrid;
-    FineField fin(FineGrid);
-    FineField fout(FineGrid);
-
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
-    _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+    blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+    _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+    blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
 };

@@ -99,24 +105,27 @@ public:

  OperatorFunction<FineField>   & _poly;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, 
-			  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
    _poly(poly),
    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+    subspace(_subspace)
+  {  };

  void operator()(const CoarseField& in, CoarseField& out) {
-
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard;
-    FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
    
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+
+    FineField fin (FineGrid); fin.checkerboard =checkerboard;
+    FineField fout(FineGrid);fout.checkerboard =checkerboard;
+    
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
 };

@@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
-  RealD                             _coarse_relax_tol;
+  RealD                          _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };
+    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
+      _coarse_relax_tol(coarse_relax_tol)  
+  {    };

  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval;
+
    // Apply operator
    _Poly(B,v);

@@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  }
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.checkerboard;
-
+    GridBase *FineGrid = _subspace[0]._grid;    
+    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;

-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);  
+    
    _smoother(_Linop,fv,fB); 

    RealD eval_poly = eval;
@@ -217,27 +229,65 @@ protected:
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis>               _Aggregate;  
-  std::vector<RealD>                              evals_fine;
-  std::vector<RealD>                              evals_coarse; 
-  std::vector<CoarseField>                        evec_coarse;
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
 public:
+
  LocalCoherenceLanczos(GridBase *FineGrid,
-		GridBase *CoarseGrid,
-		LinearOperatorBase<FineField> &FineOp,
-		int checkerboard) :
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
    _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by Hadrons module.
+  // Binds the evals_fine / evals_coarse / subspace / evec_coarse references to the
+  // caller-supplied vectors instead of the private members, so those vectors must
+  // outlive this object. Both eigenvalue vectors are emptied here; ext_subspace and
+  // ext_coarse are not modified by the constructor.
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _FineOp(FineOp),
+    _checkerboard(checkerboard),
+    evals_fine  (ext_eval_fine), 
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  // Runs blockOrthogonalise over `subspace` twice, logging after each pass.
+  // InnerProd is a coarse-grid scratch field handed to both calls.
+  void Orthogonalise(void ) {
+    CoarseScalar InnerProd(_CoarseGrid); 
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+  };

  template<typename T>  static RealD normalise(T& v) 
  {
@@ -246,43 +296,44 @@ public:
    v = v * (1.0/nn);
    return nn;
  }
-
+  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].checkerboard=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
-      _Aggregate.subspace[k].checkerboard=_checkerboard;
-      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
-      normalise(_Aggregate.subspace[k]);
+      subspace[k].checkerboard=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
    }
  }
+  */

  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
+      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
    }
  }

  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    // Use the 'subspace' reference, not the private '_subspace' storage: the
+    // external-storage constructor leaves '_subspace' empty, and the operator reads subspace[0].
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    for(int k=0;k<evec_coarse.size();k++){
      if ( k < nbasis ) { 
@@ -302,34 +353,34 @@ public:
    PlainHermOp<FineField>    Op(_FineOp);

    evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
    
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
  }
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    // Build both coarse operators on the 'subspace' reference rather than the private
+    // '_subspace' storage, which the external-storage constructor never binds or fills.
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    // The tester reads _subspace[0] of whatever it is given, so hand it the 'subspace'
+    // reference (valid for both constructors) instead of the private '_subspace' storage.
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
@@ -107,7 +107,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      // Convenience overload: forwards to the guesser-taking operator() with a ZeroGuesser.
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -129,7 +134,6 @@ namespace Grid {
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
-
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
    
      /////////////////////////////////////////////////////
@@ -146,6 +150,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;

@@ -189,7 +194,12 @@ namespace Grid {
    CBfactorise=cb;
  };
    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      // Convenience overload: forwards to the guesser-taking operator() with a ZeroGuesser.
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -225,6 +235,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
@@ -268,7 +279,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      // Convenience overload: forwards to the guesser-taking operator() with a ZeroGuesser.
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix,class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -305,6 +321,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

@@ -347,7 +364,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      // Convenience overload: forwards to the guesser-taking operator() with a ZeroGuesser.
+      ZeroGuesser guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -385,6 +407,7 @@ namespace Grid {
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

@@ -3,9 +3,12 @@

 namespace Grid {

+MemoryStats *MemoryProfiler::stats = nullptr;
+bool         MemoryProfiler::debug = false;
+
 int PointerCache::victim;

-  PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

 void *PointerCache::Insert(void *ptr,size_t bytes) {

@@ -94,4 +97,29 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
 #endif
 }

+// Formats a byte count as a short human-readable string with 1024-based prefixes,
+// e.g. 0 -> "0 B", 1024 -> "1 KB", 1536 -> "1.5 KB".
+// Whole values print without decimals; anything else prints with one decimal place.
+std::string sizeString(const size_t bytes)
+{
+  constexpr unsigned int bufSize = 256;
+  constexpr size_t       nSuffix = 7;
+  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
+  char                   buf[bufSize];
+  size_t                 s     = 0;
+  double                 count = bytes;
+  
+  // Bound s by the last valid index so suffixes[s] can never read past the table.
+  while (count >= 1024 && s + 1 < nSuffix)
+  {
+      s++;
+      count /= 1024;
+  }
+  if (count - floor(count) == 0.0)
+  {
+      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
+  }
+  else
+  {
+      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
+  }
+  
+  return std::string(buf);
+}
+
 }
@@ -63,6 +63,64 @@ namespace Grid {
    static void *Lookup(size_t bytes) ;

  };
+  
+  std::string sizeString(size_t bytes);
+
+  // Byte counters updated by the profilerAllocate / profilerFree macros; all start at zero.
+  struct MemoryStats
+  {
+    size_t totalAllocated{0};      // sum of every allocation recorded
+    size_t maxAllocated{0};        // largest value currentlyAllocated has reached
+    size_t currentlyAllocated{0};  // recorded allocations minus recorded frees
+    size_t totalFreed{0};          // sum of every free recorded
+  };
+    
+  // Static switches read by the profilerAllocate / profilerFree / profilerDebugPrint macros.
+  class MemoryProfiler
+  {
+  public:
+    // Counters the macros update; they do nothing to the counters while this is null.
+    static MemoryStats *stats;
+    // When true, the allocate/free macros also log through GridLogDebug.
+    static bool        debug;
+  };
+
+  // memString(bytes): "<count> (<human readable size>)". Parenthesised so it can be
+  // embedded in any larger expression without precedence surprises.
+  #define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
+
+  // The statement macros below are wrapped in do { } while (0) so that each expands to
+  // exactly one statement and stays correct inside an unbraced if/else. Invoke them with
+  // a trailing semicolon. The local is named profStats_ so it cannot capture a caller
+  // variable that appears in the `bytes` argument.
+
+  // Prints the current MemoryProfiler::stats counters to GridLogDebug (no-op if stats is null).
+  #define profilerDebugPrint \
+  do\
+  {\
+    if (MemoryProfiler::stats)\
+    {\
+      auto profStats_ = MemoryProfiler::stats;\
+      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(profStats_->totalAllocated) \
+                << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(profStats_->maxAllocated) \
+                << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] current: " << memString(profStats_->currentlyAllocated) \
+                << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(profStats_->totalFreed) \
+                << std::endl;\
+    }\
+  } while (0)
+
+  // Records an allocation of `bytes` in MemoryProfiler::stats (if set) and logs it when
+  // MemoryProfiler::debug is true.
+  #define profilerAllocate(bytes)\
+  do\
+  {\
+    if (MemoryProfiler::stats)\
+    {\
+      auto profStats_ = MemoryProfiler::stats;\
+      profStats_->totalAllocated     += (bytes);\
+      profStats_->currentlyAllocated += (bytes);\
+      profStats_->maxAllocated        = std::max(profStats_->maxAllocated, profStats_->currentlyAllocated);\
+    }\
+    if (MemoryProfiler::debug)\
+    {\
+      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
+      profilerDebugPrint;\
+    }\
+  } while (0)
+
+  // Records a free of `bytes` in MemoryProfiler::stats (if set) and logs it when
+  // MemoryProfiler::debug is true.
+  #define profilerFree(bytes)\
+  do\
+  {\
+    if (MemoryProfiler::stats)\
+    {\
+      auto profStats_ = MemoryProfiler::stats;\
+      profStats_->totalFreed         += (bytes);\
+      profStats_->currentlyAllocated -= (bytes);\
+    }\
+    if (MemoryProfiler::debug)\
+    {\
+      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
+      profilerDebugPrint;\
+    }\
+  } while (0)

  void check_huge_pages(void *Buf,uint64_t BYTES);

@@ -92,6 +150,7 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
+    profilerAllocate(bytes);

    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    //    if ( ptr != NULL ) 
@@ -122,6 +181,8 @@ public:
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);

+    profilerFree(bytes);
+
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

 #ifdef HAVE_MM_MALLOC_H
@@ -172,10 +233,13 @@ public:
 #ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
+    size_type bytes = __n*sizeof(_Tp);
+
+    profilerAllocate(bytes);
 #ifdef CRAY
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+    _Tp *ptr = (_Tp *) shmem_align(bytes,64);
 #else
-    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+    _Tp *ptr = (_Tp *) shmem_align(64,bytes);
 #endif
 #ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
@@ -193,18 +257,23 @@ public:
 #endif 
    return ptr;
  }
-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) { 
+    size_type bytes = __n*sizeof(_Tp);
+
+    profilerFree(bytes);
    shmem_free((void *)__p);
  }
 #else
  pointer allocate(size_type __n, const void* _p= 0) 
  {
-#ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
-#else
-    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
-#endif
    size_type bytes = __n*sizeof(_Tp);
+    
+    profilerAllocate(bytes);
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
+#else
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
+#endif
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
@@ -215,7 +284,10 @@ public:
    }
    return ptr;
  }
-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) {
+    size_type bytes = __n*sizeof(_Tp);
+
+    profilerFree(bytes);
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
@@ -59,6 +59,7 @@ public:

    virtual ~GridBase() = default;

+
    // Physics Grid information.
    std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
    std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -78,6 +79,8 @@ public:
    std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
    std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1

+    bool _isCheckerBoarded; 
+
 public:

    ////////////////////////////////////////////////////////////////
@@ -97,6 +97,7 @@ public:
      ///////////////////////
      // Grid information
      ///////////////////////
+      _isCheckerBoarded = false;
      _ndimension = dimensions.size();

      _fdimensions.resize(_ndimension);
@@ -122,6 +123,7 @@ public:

        // Use a reduced simd grid
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@@ -166,6 +168,7 @@ public:
        block = block * _rdimensions[d];
      }
    };
+
 };
 }
 #endif
@@ -171,9 +171,8 @@ public:
              const std::vector<int> &checker_dim_mask,
              int checker_dim)
    {
-      ///////////////////////
-      // Grid information
-      ///////////////////////
+
+      _isCheckerBoarded = true;
      _checker_dim = checker_dim;
      assert(checker_dim_mask[checker_dim] == 1);
      _ndimension = dimensions.size();
@@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H

+#include <Grid/communicator/SharedMemory.h>
 #include <Grid/communicator/Communicator_base.h>

 #endif
@@ -36,33 +36,9 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
-void *              CartesianCommunicator::ShmCommBuf;
-uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
-int CartesianCommunicator::Hugepages = 0;
-
-/////////////////////////////////
-// Alloc, free shmem region
-/////////////////////////////////
-void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
-  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
-  void *ptr = (void *)heap_top;
-  heap_top  += bytes;
-  heap_bytes+= bytes;
-  if (heap_bytes >= MAX_MPI_SHM_BYTES) {
-    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
-    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
-    assert(heap_bytes<MAX_MPI_SHM_BYTES);
-  }
-  return ptr;
-}
-void CartesianCommunicator::ShmBufferFreeAll(void) { 
-  heap_top  =(size_t)ShmBufferSelf();
-  heap_bytes=0;
-}

 /////////////////////////////////
 // Grid information queries
@@ -95,282 +71,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
 }
-
-
-#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
-void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
-{
-  std::vector<int> row(_ndimension,1);
-  assert(dim>=0 && dim<_ndimension);
-
-  //  Split the communicator
-  row[dim] = _processors[dim];
-
-  int me;
-  CartesianCommunicator Comm(row,*this,me);
-  Comm.AllToAll(in,out,words,bytes);
-}
-void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
-{
-  // MPI is a pain and uses "int" arguments
-  // 64*64*64*128*16 == 500Million elements of data.
-  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
-  // (Turns up on 32^3 x 64 Gparity too)
-  MPI_Datatype object;
-  int iwords; 
-  int ibytes;
-  iwords = words;
-  ibytes = bytes;
-  assert(words == iwords); // safe to cast to int ?
-  assert(bytes == ibytes); // safe to cast to int ?
-  MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
-  MPI_Type_commit(&object);
-  MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
-  MPI_Type_free(&object);
-}
-#endif
-
-#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) 
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
-{
-  _ndimension = processors.size();
-
-  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
-  std::vector<int> parent_processor_coor(_ndimension,0);
-  std::vector<int> parent_processors    (_ndimension,1);
-
-  // Can make 5d grid from 4d etc...
-  int pad = _ndimension-parent_ndimension;
-  for(int d=0;d<parent_ndimension;d++){
-    parent_processor_coor[pad+d]=parent._processor_coor[d];
-    parent_processors    [pad+d]=parent._processors[d];
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // split the communicator
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  int Nparent;
-  MPI_Comm_size(parent.communicator,&Nparent);
-
-  int childsize=1;
-  for(int d=0;d<processors.size();d++) {
-    childsize *= processors[d];
-  }
-  int Nchild = Nparent/childsize;
-  assert (childsize * Nchild == Nparent);
-
-  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
-  std::vector<int> scoor(_ndimension); // coor of split within parent
-  std::vector<int> ssize(_ndimension); // coor of split within parent
-
-  std::vector<int> pcoor(_ndimension,0); 
-  std::vector<int> pdims(_ndimension,1); 
-
-  if(parent._processors.size()==4 && _ndimension==5){
-      for(int i=0;i<4;i++) pcoor[i+1]=parent._processor_coor[i];
-      for(int i=0;i<4;i++) pdims[i+1]=parent._processors[i];
-  } else {
-      assert(_ndimension == parent._ndimension);
-      for(int i=0;i<_ndimension;i++) pcoor[i]=parent._processor_coor[i];
-      for(int i=0;i<_ndimension;i++) pdims[i]=parent._processors[i];
-  }
-
-  for(int d=0;d<_ndimension;d++){
-    ccoor[d] = pcoor[d] % processors[d];
-    scoor[d] = pcoor[d] / processors[d];
-    ssize[d] = pdims[d] / processors[d];
-  }
-  int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
-  // Mpi uses the reverse Lexico convention to us
-  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
-  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
-
-  MPI_Comm comm_split;
-  if ( Nchild > 1 ) { 
-
-    if(0){
-      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
-      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
-      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
-      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
-      std::cout<<std::endl;
-    }
-
-    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
-    assert(ierr==0);
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Declare victory
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    //    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
-    //	      << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
-  } else {
-    comm_split=parent.communicator;
-    srank = 0;
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Set up from the new split communicator
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  InitFromMPICommunicator(processors,comm_split);
-
-  if(0){ 
-    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
-    for(int d=0;d<processors.size();d++){
-      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
-    }
-  }
-  for(int d=0;d<processors.size();d++){
-    assert(_processor_coor[d] == ccoor[d] );
-  }
-
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-// Take an MPI_Comm and self assemble
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
-{
-  _ndimension = processors.size();
-  _processor_coor.resize(_ndimension);
-
-  /////////////////////////////////
-  // Count the requested nodes
-  /////////////////////////////////
-  _Nprocessors=1;
-  _processors = processors;
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  std::vector<int> periodic(_ndimension,1);
-  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
-  MPI_Comm_rank(communicator,&_processor);
-  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
-
-  if ( 0 && (communicator_base != communicator_world) ) {
-    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
-    
-    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
-    for(int d=0;d<_processors.size();d++){
-      std::cout << _processor_coor[d]<<" ";
-    }
-    std::cout << std::endl;
-  }
-
-  int Size;
-  MPI_Comm_size(communicator,&Size);
-
-#if defined(GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
-  communicator_halo.resize (2*_ndimension);
-  for(int i=0;i<_ndimension*2;i++){
-    MPI_Comm_dup(communicator,&communicator_halo[i]);
-  }
-#endif
-  
-  assert(Size==_Nprocessors);
-}
-#endif
-
-#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) 
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
-{
-  InitFromMPICommunicator(processors,communicator_world);
-}
-
-#endif
-
-#if !defined( GRID_COMMS_MPI3) 
-int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
-int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
-#endif
-
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
-double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,
-						     void *recv,
-						     int recv_from_rank,
-						     int bytes, int dir)
-{
-  std::vector<CommsRequest_t> list;
-  // Discard the "dir"
-  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
-  SendToRecvFromComplete(list);
-  return 2.0*bytes;
-}
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
-							 int xmit_to_rank,
-							 void *recv,
-							 int recv_from_rank,
-							 int bytes, int dir)
-{
-  // Discard the "dir"
-  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
-  return 2.0*bytes;
-}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{
-  SendToRecvFromComplete(waitall);
-}
-#endif
-
-#if !defined( GRID_COMMS_MPI3) 
-
-void CartesianCommunicator::StencilBarrier(void){};
-
-commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
-
-void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
-
-void *CartesianCommunicator::ShmBuffer(int rank) {
-  return NULL;
-}
-void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { 
-  return NULL;
-}
-void CartesianCommunicator::ShmInitGeneric(void){
-#if 1
-  int mmap_flag =0;
-#ifdef MAP_ANONYMOUS
-  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
-#endif
-#ifdef MAP_ANON
-  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
-#endif
-#ifdef MAP_HUGETLB
-  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
-#endif
-  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
-  if (ShmCommBuf == (void *)MAP_FAILED) {
-    perror("mmap failed ");
-    exit(EXIT_FAILURE);  
-  }
-#ifdef MADV_HUGEPAGE
-  if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
-#endif
-#else 
-  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
-  ShmCommBuf=(void *)&ShmBufStorageVector[0];
-#endif
-  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
-}
-
-#endif
  
 }

@@ -32,117 +32,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 // Processor layout information
 ///////////////////////////////////
-#ifdef GRID_COMMS_MPI
-#include <mpi.h>
-#endif
-#ifdef GRID_COMMS_MPI3
-#include <mpi.h>
-#endif
-#ifdef GRID_COMMS_MPIT
-#include <mpi.h>
-#endif
-#ifdef GRID_COMMS_SHMEM
-#include <mpp/shmem.h>
-#endif
+#include <Grid/communicator/SharedMemory.h>

 namespace Grid {

-class CartesianCommunicator {
-  public:    
+class CartesianCommunicator : public SharedMemory {

+public:    

  ////////////////////////////////////////////
-  // Isend/Irecv/Wait, or Sendrecv blocking
+  // Policies
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
-
-  ///////////////////////////////////////////
-  // Up to 65536 ranks per node adequate for now
-  // 128MB shared memory for comms enought for 48^4 local vol comms
-  // Give external control (command line override?) of this
-  ///////////////////////////////////////////
-  static const int MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
-  // use explicit huge pages
-  static int       Hugepages;

+  ////////////////////////////////////////////
  // Communicator should know nothing of the physics grid, only processor grid.
+  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // linear processor coordinate
-  unsigned long _ndimension;
-
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
-  static MPI_Comm communicator_world;
-
-  MPI_Comm              communicator;
-  std::vector<MPI_Comm> communicator_halo;
-
-  typedef MPI_Request CommsRequest_t;
-
-#else 
-  typedef int CommsRequest_t;
-#endif
-
-
-  ////////////////////////////////////////////////////////////////////
-  // Helper functionality for SHM Windows common to all other impls
-  ////////////////////////////////////////////////////////////////////
-  // Longer term; drop this in favour of a master / slave model with 
-  // cartesian communicator on a subset of ranks, slave ranks controlled
-  // by group leader with data xfer via shared memory
-  ////////////////////////////////////////////////////////////////////
-#ifdef GRID_COMMS_MPI3
-
-  static int ShmRank;
-  static int ShmSize;
-  static int GroupRank;
-  static int GroupSize;
-  static int WorldRank;
-  static int WorldSize;
-
-  std::vector<int>  WorldDims;
-  std::vector<int>  GroupDims;
-  std::vector<int>  ShmDims;
-  
-  std::vector<int> GroupCoor;
-  std::vector<int> ShmCoor;
-  std::vector<int> WorldCoor;
-
-  static std::vector<int> GroupRanks; 
-  static std::vector<int> MyGroup;
-  static int ShmSetup;
-  static MPI_Win ShmWindow; 
-  static MPI_Comm ShmComm;
-  
-  std::vector<int>  LexicographicToWorldRank;
-  
-  static std::vector<void *> ShmCommBufs;
-
-#else 
-  static void ShmInitGeneric(void);
-  static commVector<uint8_t> ShmBufStorageVector;
-#endif 
-
-  /////////////////////////////////
-  // Grid information and queries
-  // Implemented in Communicator_base.C
-  /////////////////////////////////
-  static void * ShmCommBuf;
-
-  
-  size_t heap_top;
-  size_t heap_bytes;
-
-  void *ShmBufferSelf(void);
-  void *ShmBuffer(int rank);
-  void *ShmBufferTranslate(int rank,void * local_p);
-  void *ShmBufferMalloc(size_t bytes);
-  void ShmBufferFreeAll(void) ;
+  unsigned long    _ndimension;
+  static Grid_MPI_Comm      communicator_world;
+  Grid_MPI_Comm             communicator;
+  std::vector<Grid_MPI_Comm> communicator_halo;
  
  ////////////////////////////////////////////////
  // Must call in Grid startup
@@ -158,14 +74,15 @@ class CartesianCommunicator {
  virtual ~CartesianCommunicator();

 private:
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)  || defined (GRID_COMMS_MPI3) 
+
  ////////////////////////////////////////////////
  // Private initialise from an MPI communicator
  // Can use after an MPI_Comm_split, but hidden from user so private
  ////////////////////////////////////////////////
-  void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
-#endif
+  void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
+
 public:
+
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
@@ -181,8 +98,6 @@ class CartesianCommunicator {
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
-  int                      NodeCount(void)    ;
-  int                      RankCount(void)    ;

  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
@@ -270,15 +185,10 @@ class CartesianCommunicator {
  template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
    assert(dim>=0);
    assert(dim<_ndimension);
-    int numnode = _processors[dim];
-    //    std::cerr << " AllToAll in.size()  "<<in.size()<<std::endl;
-    //    std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
    assert(in.size()==out.size());
+    int numnode = _processors[dim];
    uint64_t bytes=sizeof(T);
    uint64_t words=in.size()/numnode;
-    //    std:: cout << "AllToAll buffer size "<< in.size()*sizeof(T)<<std::endl;
-    //    std:: cout << "AllToAll datum bytes "<< bytes<<std::endl;
-    //    std:: cout << "AllToAll datum count "<< words<<std::endl;
    assert(numnode * words == in.size());
    assert(words < (1ULL<<31));
    AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
@@ -1,222 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/Communicator_mpi.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/ActionCore.h>
-#include <mpi.h>
-
-namespace Grid {
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-MPI_Comm CartesianCommunicator::communicator_world;
-
-// Should error check all MPI calls.
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  int flag;
-  int provided;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    if ( provided != MPI_THREAD_MULTIPLE ) {
-      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-    }
-  }
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::~CartesianCommunicator()
-{
-  int MPI_is_finalised;
-  MPI_Finalized(&MPI_is_finalised);
-  if (communicator && !MPI_is_finalised)
-    MPI_Comm_free(&communicator);
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
-  assert(ierr==0);
-  return rank;
-}
-void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-					   int dest,
-					   void *recv,
-					   int from,
-					   int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) { 
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  int myrank = _processor;
-  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
-    MPI_Request xrq;
-    MPI_Request rrq;
-
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-    
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else { 
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		      recv,bytes,MPI_CHAR,from, from,
-		      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
-    int nreq=list.size();
-    std::vector<MPI_Status> status(nreq);
-    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-    assert(ierr==0);
-  }
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,
-		     bytes,
-		     MPI_BYTE,
-		     root,
-		     communicator);
-  assert(ierr==0);
-}
-  ///////////////////////////////////////////////////////
-  // Should only be used prior to Grid Init finished.
-  // Check for this?
-  ///////////////////////////////////////////////////////
-int CartesianCommunicator::RankWorld(void){ 
-  int r; 
-  MPI_Comm_rank(communicator_world,&r);
-  return r;
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr= MPI_Bcast(data,
-		      bytes,
-		      MPI_BYTE,
-		      root,
-		      communicator_world);
-  assert(ierr==0);
-}
-
-
-
-}
-
@@ -26,580 +26,246 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
-
-#include <mpi.h>
-
-#include <semaphore.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-#include <zlib.h>
-#ifdef HAVE_NUMAIF_H
-#include <numaif.h>
-#endif
-
+#include <Grid/communicator/SharedMemory.h>

 namespace Grid {

-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-int CartesianCommunicator::ShmSetup = 0;
+Grid_MPI_Comm       CartesianCommunicator::communicator_world;

-int CartesianCommunicator::ShmRank;
-int CartesianCommunicator::ShmSize;
-int CartesianCommunicator::GroupRank;
-int CartesianCommunicator::GroupSize;
-int CartesianCommunicator::WorldRank;
-int CartesianCommunicator::WorldSize;
-
-MPI_Comm CartesianCommunicator::communicator_world;
-MPI_Comm CartesianCommunicator::ShmComm;
-MPI_Win  CartesianCommunicator::ShmWindow;
-
-std::vector<int> CartesianCommunicator::GroupRanks;  
-std::vector<int> CartesianCommunicator::MyGroup;
-std::vector<void *> CartesianCommunicator::ShmCommBufs;
-
-int CartesianCommunicator::NodeCount(void)    { return GroupSize;};
-int CartesianCommunicator::RankCount(void)    { return WorldSize;};
-
-
-#undef FORCE_COMMS
-void *CartesianCommunicator::ShmBufferSelf(void)
+////////////////////////////////////////////
+// First initialise of comms system
+////////////////////////////////////////////
+void CartesianCommunicator::Init(int *argc, char ***argv) 
 {
-  return ShmCommBufs[ShmRank];
-}
-void *CartesianCommunicator::ShmBuffer(int rank)
-{
-  int gpeer = GroupRanks[rank];
-#ifdef FORCE_COMMS
-  return NULL;
-#endif
-  if (gpeer == MPI_UNDEFINED){
-    return NULL;
-  } else { 
-    return ShmCommBufs[gpeer];
-  }
-}
-void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
-{
-  static int count =0;
-  int gpeer = GroupRanks[rank];
-  assert(gpeer!=ShmRank); // never send to self
-  assert(rank!=WorldRank);// never send to self
-#ifdef FORCE_COMMS
-  return NULL;
-#endif
-  if (gpeer == MPI_UNDEFINED){
-    return NULL;
-  } else { 
-    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
-    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
-    return (void *) remote;
-  }
-}
-
-void CartesianCommunicator::Init(int *argc, char ***argv) {

  int flag;
  int provided;
-  //  mtrace();

  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    // With exactly one comms thread any level other than MPI_THREAD_SINGLE is acceptable; with more than one comms thread MPI_THREAD_MULTIPLE is required.
+    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+        (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
+      assert(0);
  }

  Grid_quiesce_nodes();

+  // The world communicator is duplicated only once, here, and is deliberately never freed.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  MPI_Comm_rank(communicator_world,&WorldRank);
-  MPI_Comm_size(communicator_world,&WorldSize);

-  if ( WorldRank == 0 ) {
-    std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl;
-  }
-
-  /////////////////////////////////////////////////////////////////////
-  // Split into groups that can share memory
-  /////////////////////////////////////////////////////////////////////
-  MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
-  MPI_Comm_rank(ShmComm     ,&ShmRank);
-  MPI_Comm_size(ShmComm     ,&ShmSize);
-  GroupSize = WorldSize/ShmSize;
-
-  /////////////////////////////////////////////////////////////////////
-  // find world ranks in our SHM group (i.e. which ranks are on our node)
-  /////////////////////////////////////////////////////////////////////
-  MPI_Group WorldGroup, ShmGroup;
-  MPI_Comm_group (communicator_world, &WorldGroup); 
-  MPI_Comm_group (ShmComm, &ShmGroup);
-  
-  std::vector<int> world_ranks(WorldSize); 
-  GroupRanks.resize(WorldSize); 
-  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
-  
-  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
-
-  ///////////////////////////////////////////////////////////////////
-  // Identify who is in my group and noninate the leader
-  ///////////////////////////////////////////////////////////////////
-  int g=0;
-  MyGroup.resize(ShmSize);
-  for(int rank=0;rank<WorldSize;rank++){
-    if(GroupRanks[rank]!=MPI_UNDEFINED){
-      assert(g<ShmSize);
-      MyGroup[g++] = rank;
-    }
-  }
-  
-  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
-  int myleader = MyGroup[0];
-  
-  std::vector<int> leaders_1hot(WorldSize,0);
-  std::vector<int> leaders_group(GroupSize,0);
-  leaders_1hot [ myleader ] = 1;
-    
-  ///////////////////////////////////////////////////////////////////
-  // global sum leaders over comm world
-  ///////////////////////////////////////////////////////////////////
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
-  assert(ierr==0);
-  ///////////////////////////////////////////////////////////////////
-  // find the group leaders world rank
-  ///////////////////////////////////////////////////////////////////
-  int group=0;
-  for(int l=0;l<WorldSize;l++){
-    if(leaders_1hot[l]){
-      leaders_group[group++] = l;
-    }
-  }
-  ///////////////////////////////////////////////////////////////////
-  // Identify the rank of the group in which I (and my leader) live
-  ///////////////////////////////////////////////////////////////////
-  GroupRank=-1;
-  for(int g=0;g<GroupSize;g++){
-    if (myleader == leaders_group[g]){
-      GroupRank=g;
-    }
-  }
-  assert(GroupRank!=-1);
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the shared window for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(ShmComm);
-
-  ShmCommBuf = 0;
-  ShmCommBufs.resize(ShmSize);
-
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Hugetlbf and others map filesystems as mappable huge pages
-  ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_MPI3_SHMMMAP
-  char shm_name [NAME_MAX];
-  for(int r=0;r<ShmSize;r++){
-    
-    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-    sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
-    //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
-    //    printf("Opening file %s \n",shm_name);
-    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
-    if ( fd == -1) { 
-      printf("open %s failed\n",shm_name);
-      perror("open hugetlbfs");
-      exit(0);
-    }
-    int mmap_flag = MAP_SHARED ;
-#ifdef MAP_POPULATE    
-    mmap_flag|=MAP_POPULATE;
-#endif
-#ifdef MAP_HUGETLB
-    if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
-#endif
-    void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
-    if ( ptr == (void *)MAP_FAILED ) {    
-      printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);    
-    }
-    assert(((uint64_t)ptr&0x3F)==0);
-    ShmCommBufs[r] =ptr;
-    
-  }
-#endif
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
-  // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
-  // the posix shm virtual file system
-  ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_MPI3_SHMOPEN
-  char shm_name [NAME_MAX];
-  if ( ShmRank == 0 ) {
-    for(int r=0;r<ShmSize;r++){
-
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
-
-      shm_unlink(shm_name);
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
-      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
-      ftruncate(fd, size);
-      
-      int mmap_flag = MAP_SHARED;
-#ifdef MAP_POPULATE 
-      mmap_flag |= MAP_POPULATE;
-#endif
-#ifdef MAP_HUGETLB
-      if (Hugepages) mmap_flag |= MAP_HUGETLB;
-#endif
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
-
-      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
-      assert(((uint64_t)ptr&0x3F)==0);
-
-// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
-#if 0
-//#ifdef HAVE_NUMAIF_H
-	int status;
-	int flags=MPOL_MF_MOVE;
-#ifdef KNL
-	int nodes=1; // numa domain == MCDRAM
-	// Find out if in SNC2,SNC4 mode ?
-#else
-	int nodes=r; // numa domain == MPI ID
-#endif
-	unsigned long count=1;
-	for(uint64_t page=0;page<size;page+=4096){
-	  void *pages = (void *) ( page + (uint64_t)ptr );
-	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
-	  ierr= move_pages(0,count, &pages,&nodes,&status,flags);
-	  if (ierr && (page==0)) perror("numa relocate command failed");
-	}
-#endif
-	ShmCommBufs[r] =ptr;
-      
-    }
-  }
-
-  MPI_Barrier(ShmComm);
-
-  if ( ShmRank != 0 ) { 
-    for(int r=0;r<ShmSize;r++){
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
-    
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
-
-      int fd=shm_open(shm_name,O_RDWR,0666);
-      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
-
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
-      assert(((uint64_t)ptr&0x3F)==0);
-      ShmCommBufs[r] =ptr;
-    }
-  }
-#endif
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // SHMGET SHMAT and SHM_HUGETLB flag
-  ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_MPI3_SHMGET
-  std::vector<int> shmids(ShmSize);
-
-  if ( ShmRank == 0 ) {
-    for(int r=0;r<ShmSize;r++){
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      key_t key   = IPC_PRIVATE;
-      int flags = IPC_CREAT | SHM_R | SHM_W;
-#ifdef SHM_HUGETLB
-      if (Hugepages) flags|=SHM_HUGETLB;
-#endif
-      if ((shmids[r]= shmget(key,size, flags)) ==-1) {
-	int errsv = errno;
-	printf("Errno %d\n",errsv);
-	printf("key   %d\n",key);
-	printf("size  %lld\n",size);
-	printf("flags %d\n",flags);
-	perror("shmget");
-	exit(1);
-      } else { 
-	printf("shmid: 0x%x\n", shmids[r]);
-      }
-    }
-  }
-  MPI_Barrier(ShmComm);
-  MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
-  MPI_Barrier(ShmComm);
-
-  for(int r=0;r<ShmSize;r++){
-    ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
-    if (ShmCommBufs[r] == (uint64_t *)-1) {
-      perror("Shared memory attach failure");
-      shmctl(shmids[r], IPC_RMID, NULL);
-      exit(2);
-    }
-    printf("shmaddr: %p\n", ShmCommBufs[r]);
-  }
-  MPI_Barrier(ShmComm);
-  // Mark for clean up
-  for(int r=0;r<ShmSize;r++){
-    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
-  }
-  MPI_Barrier(ShmComm);
-
-#endif
-  ShmCommBuf         = ShmCommBufs[ShmRank];
-
-  MPI_Barrier(ShmComm);
-  if ( ShmRank == 0 ) {
-    for(int r=0;r<ShmSize;r++){
-      uint64_t * check = (uint64_t *) ShmCommBufs[r];
-      check[0] = GroupRank;
-      check[1] = r;
-      check[2] = 0x5A5A5A;
-    }
-  }
-
-  MPI_Barrier(ShmComm);
-  for(int r=0;r<ShmSize;r++){
-    uint64_t * check = (uint64_t *) ShmCommBufs[r];
-    
-    assert(check[0]==GroupRank);
-    assert(check[1]==r);
-    assert(check[2]==0x5A5A5A);
-
-  }
-  MPI_Barrier(ShmComm);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Verbose for now
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  if (WorldRank == 0){
-    std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
-    std::cout<< WorldSize << " Ranks " ;
-    std::cout<< GroupSize << " Nodes " ;
-    std::cout<< " with "<< ShmSize  << " ranks-per-node "<<std::endl;
-    
-    std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size ";
-    std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
-
-    for(int g=0;g<GroupSize;g++){
-      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
-    }
-
-    std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
-    for(int g=0;g<ShmSize;g++){
-      std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
-      if(g!=ShmSize-1) std::cout<<",";
-      else std::cout<<"}"<<std::endl;
-    }
-  }
-  
-  for(int g=0;g<GroupSize;g++){
-    if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
-    for(int r=0;r<ShmSize;r++){
-      if ( (ShmRank == 0) && (GroupRank==g) ) {
-	std::cout<<MyGroup[r];
-	if(r<ShmSize-1) std::cout<<",";
-	else std::cout<<"}"<<std::endl<<std::flush;
-      }
-      MPI_Barrier(communicator_world);
-    }
-  }
-
-  assert(ShmSetup==0);  ShmSetup=1;
+  GlobalSharedMemory::Init(communicator_world);
+  GlobalSharedMemory::SharedMemoryAllocate(
+		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
+		   GlobalSharedMemory::Hugepages);
 }

-////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Want to implement some magic ... Group sub-cubes into those on same node
-////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
+///////////////////////////////////////////////////////////////////////////
+// Use cartesian communicators now even in MPI3
+///////////////////////////////////////////////////////////////////////////
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
-  std::vector<int> coor = _processor_coor; // my coord
-  assert(std::abs(shift) <_processors[dim]);
-
-  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,source,_processors);
-  source = LexicographicToWorldRank[source];
-
-  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,dest,_processors);
-  dest = LexicographicToWorldRank[dest];
-
-}// rank is world rank.
-
+  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+  assert(ierr==0);
+}
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int rank;
-  Lexicographic::IndexFromCoor(coor,rank,_processors);
-  rank = LexicographicToWorldRank[rank];
+  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
+  assert(ierr==0);
  return rank;
-}// rank is world rank
-
+}
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
-  int lr=-1;
-  for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor
-    if( LexicographicToWorldRank[r]==rank) lr = r;
-  }
-  assert(lr!=-1);
-  Lexicographic::CoorFromIndex(coor,lr,_processors);
+  coor.resize(_ndimension);
+  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+  assert(ierr==0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Initialises from communicator_world
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
+{
+  MPI_Comm optimal_comm;
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm which must be freed
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
+  InitFromMPICommunicator(processors,optimal_comm);
+  SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
+  // Free the temp communicator
+  ///////////////////////////////////////////////////
+  MPI_Comm_free(&optimal_comm);
 }

 //////////////////////////////////
 // Try to subdivide communicator
 //////////////////////////////////
-/*
- * Use default in MPI compile
- */
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
-  : CartesianCommunicator(processors) 
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
 {
-  std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
+  _ndimension = processors.size();
+
+  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  std::vector<int> parent_processor_coor(_ndimension,0);
+  std::vector<int> parent_processors    (_ndimension,1);
+
+  // Can make 5d grid from 4d etc...
+  int pad = _ndimension-parent_ndimension;
+  for(int d=0;d<parent_ndimension;d++){
+    parent_processor_coor[pad+d]=parent._processor_coor[d];
+    parent_processors    [pad+d]=parent._processors[d];
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // split the communicator
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  int Nparent = parent._processors ; 
+  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
+  int Nparent;
+  MPI_Comm_size(parent.communicator,&Nparent);
+  //  std::cout << " Parent size  "<<Nparent <<std::endl;
+
+  int childsize=1;
+  for(int d=0;d<processors.size();d++) {
+    childsize *= processors[d];
+  }
+  int Nchild = Nparent/childsize;
+  assert (childsize * Nchild == Nparent);
+
+  //  std::cout << " child size  "<<childsize <<std::endl;
+
+  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
+  std::vector<int> scoor(_ndimension); // coor of split within parent
+  std::vector<int> ssize(_ndimension); // number of split grids along each dimension of the parent
+
+  for(int d=0;d<_ndimension;d++){
+    ccoor[d] = parent_processor_coor[d] % processors[d];
+    scoor[d] = parent_processor_coor[d] / processors[d];
+    ssize[d] = parent_processors[d]     / processors[d];
+  }
+
+  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
+  int crank;  
+  // MPI uses the reverse lexicographic convention to ours, so the reversed routines are called
+  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
+  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
+
+  MPI_Comm comm_split;
+  if ( Nchild > 1 ) { 
+
+    if(0){
+      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
+      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
+      std::cout<<std::endl;
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      // Declare victory
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
+		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
+      std::cout << " Split communicator " <<comm_split <<std::endl;
+    }
+
+    ////////////////////////////////////////////////////////////////
+    // Split the communicator
+    ////////////////////////////////////////////////////////////////
+    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
+    assert(ierr==0);
+
+  } else {
+    srank = 0;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Set up from the new split communicator
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  InitFromMPICommunicator(processors,comm_split);
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take the right SHM buffers
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  SetCommunicator(comm_split);
+  
+  ///////////////////////////////////////////////
+  // Free the temp communicator 
+  ///////////////////////////////////////////////
+  MPI_Comm_free(&comm_split);
+
+  if(0){ 
+    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
+    for(int d=0;d<processors.size();d++){
+      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
+    }
+  }
+  for(int d=0;d<processors.size();d++){
+    assert(_processor_coor[d] == ccoor[d] );
+  }
 }

-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
-{ 
-  int ierr;
-  communicator=communicator_world;
-
+void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
+{
+  ////////////////////////////////////////////////////
+  // Creates communicator, and the communicator_halo
+  ////////////////////////////////////////////////////
  _ndimension = processors.size();
+  _processor_coor.resize(_ndimension);
+
+  /////////////////////////////////
+  // Count the requested nodes
+  /////////////////////////////////
+  _Nprocessors=1;
+  _processors = processors;
+  for(int i=0;i<_ndimension;i++){
+    _Nprocessors*=_processors[i];
+  }
+
+  std::vector<int> periodic(_ndimension,1);
+  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
+  MPI_Comm_rank(communicator,&_processor);
+  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+  if ( 0 && (communicator_base != communicator_world) ) {
+    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
+    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
+    for(int d=0;d<_processors.size();d++){
+      std::cout << _processor_coor[d]<<" ";
+    }
+    std::cout << std::endl;
+  }
+
+  int Size;
+  MPI_Comm_size(communicator,&Size);

  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
+  assert(Size==_Nprocessors);
+}

-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = -1;
-  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
-    if ( (0x1<<i) == ShmSize ) {
-      log2size = i;
-      break;
-    }
-  }
-  assert(log2size != -1);
-
-  ////////////////////////////////////////////////////////////////
-  // Identify subblock of ranks on node spreading across dims
-  // in a maximally symmetrical way
-  ////////////////////////////////////////////////////////////////
-  std::vector<int> WorldDims = processors;
-
-  ShmDims.resize  (_ndimension,1);
-  GroupDims.resize(_ndimension);
-  ShmCoor.resize  (_ndimension);
-  GroupCoor.resize(_ndimension);
-  WorldCoor.resize(_ndimension);
-
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%_ndimension;
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Establish torus of processes and nodes with sub-blockings
-  ////////////////////////////////////////////////////////////////
-  for(int d=0;d<_ndimension;d++){
-    GroupDims[d] = WorldDims[d]/ShmDims[d];
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Verbose
-  ////////////////////////////////////////////////////////////////
-#if 0
-  std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
-  std::cout<< GridLogMessage << "SHM   ";
-  for(int d=0;d<_ndimension;d++){
-    std::cout<< ShmDims[d] <<" ";
-  }
-  std::cout<< std::endl;
-
-  std::cout<< GridLogMessage << "Group ";
-  for(int d=0;d<_ndimension;d++){
-    std::cout<< GroupDims[d] <<" ";
-  }
-  std::cout<< std::endl;
-
-  std::cout<< GridLogMessage<<"World ";
-  for(int d=0;d<_ndimension;d++){
-    std::cout<< WorldDims[d] <<" ";
-  }
-  std::cout<< std::endl;
-#endif
-  ////////////////////////////////////////////////////////////////
-  // Check processor counts match
-  ////////////////////////////////////////////////////////////////
-  _Nprocessors=1;
-  _processors = processors;
-  _processor_coor.resize(_ndimension);
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-  assert(WorldSize==_Nprocessors);
-      
-  ////////////////////////////////////////////////////////////////
-  // Establish mapping between lexico physics coord and WorldRank
-  ////////////////////////////////////////////////////////////////
-  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
-  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
-  for(int d=0;d<_ndimension;d++){
-    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
-  }
-  _processor_coor = WorldCoor;
-  _processor      = WorldRank;
-
-  ///////////////////////////////////////////////////////////////////
-  // global sum Lexico to World mapping
-  ///////////////////////////////////////////////////////////////////
-  int lexico;
-  LexicographicToWorldRank.resize(WorldSize,0);
-  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
-  LexicographicToWorldRank[lexico] = WorldRank;
-  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
-  assert(ierr==0);
-
-  for(int i=0;i<WorldSize;i++){
-
-    int wr = LexicographicToWorldRank[i];
-    //    int wr = i;
-
-    std::vector<int> coor(_ndimension);
-    ProcessorCoorFromRank(wr,coor); // from world rank
-    int ck = RankFromProcessorCoor(coor);
-    assert(ck==wr);
-
-    if ( wr == WorldRank ) { 
-      for(int j=0;j<coor.size();j++) {
-	assert(coor[j] == _processor_coor[j]);
-      }
-    }
-    /*
-    std::cout << GridLogMessage<< " Lexicographic "<<i;
-    std::cout << " MPI rank      "<<wr;
-    std::cout << " Coor          ";
-    for(int j=0;j<coor.size();j++) std::cout << coor[j];
-    std::cout<< std::endl;
-    */
-    /////////////////////////////////////////////////////
-    // Check everyone agrees on everyone elses coords
-    /////////////////////////////////////////////////////
-    std::vector<int> mcoor = coor;
-    this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
-    for(int d = 0 ; d< _ndimension; d++) {
-      assert(coor[d] == mcoor[d]);
-    }
-  }
-};
 CartesianCommunicator::~CartesianCommunicator()
 {
  int MPI_is_finalised;
@@ -734,19 +400,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  MPI_Request rrq;

  int ierr;
-  int gdest = GroupRanks[dest];
-  int gfrom = GroupRanks[from];
-  int gme   = GroupRanks[_processor];
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];

  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;

-#ifdef FORCE_COMMS
-  gdest = MPI_UNDEFINED;
-  gfrom = MPI_UNDEFINED;
-#endif
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
    assert(ierr==0);
@@ -815,5 +477,38 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  assert(ierr==0);
 }

+void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  std::vector<int> row(_ndimension,1);
+  assert(dim>=0 && dim<_ndimension);
+
+  //  Split the communicator
+  row[dim] = _processors[dim];
+
+  int me;
+  CartesianCommunicator Comm(row,*this,me);
+  Comm.AllToAll(in,out,words,bytes);
+}
+void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  // MPI is a pain and uses "int" arguments
+  // e.g. 64*64*64*128*16 is roughly 500 million elements of data.
+  // At 24*4 bytes per element that is about 50x10^9 bytes, far above the ~2x10^9 limit of an int (a Y2K-style overflow bug).
+  // (The same overflow turns up on 32^3 x 64 Gparity too.)
+  MPI_Datatype object;
+  int iwords; 
+  int ibytes;
+  iwords = words;
+  ibytes = bytes;
+  assert(words == iwords); // safe to cast to int ?
+  assert(bytes == ibytes); // safe to cast to int ?
+  MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
+  MPI_Type_commit(&object);
+  MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+  MPI_Type_free(&object);
+}
+
+
+
 }

@@ -1,988 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/Communicator_mpi.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include "Grid.h"
-#include <mpi.h>
-//#include <numaif.h>
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Workarounds:
-/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
-///    darwin dispatch semaphores don't seem to be multiprocess.
-///
-/// ii) openmpi under --mca shmem posix works with two squadrons per node; 
-///     openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
-///     memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
-///
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-#include <semaphore.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <limits.h>
-typedef sem_t *Grid_semaphore;
-
-
-#error  /*THis is deprecated*/
-
-#if 0 
-#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
-#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
-#define SEM_POST(S) assert ( sem_post(S) == 0 ); 
-#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
-#else
-#define SEM_INIT(S)      ;
-#define SEM_INIT_EXCL(S) ;
-#define SEM_POST(S) ;
-#define SEM_WAIT(S) ;
-#endif
-#include <sys/mman.h>
-
-namespace Grid {
-
-enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
-
-struct Descriptor {
-  uint64_t buf;
-  size_t bytes;
-  int rank;
-  int tag;
-  int command;
-  uint64_t xbuf;
-  uint64_t rbuf;
-  int xtag;
-  int rtag;
-  int src;
-  int dest;
-  MPI_Request request;
-};
-
-const int pool = 48;
-
-class SlaveState {
-public:
-  volatile int head;
-  volatile int start;
-  volatile int tail;
-  volatile Descriptor Descrs[pool];
-};
-
-class Slave {
-public:
-  Grid_semaphore  sem_head;
-  Grid_semaphore  sem_tail;
-  SlaveState *state;
-  MPI_Comm squadron;
-  uint64_t     base;
-  int universe_rank;
-  int vertical_rank;
-  char sem_name [NAME_MAX];
-  ////////////////////////////////////////////////////////////
-  // Descriptor circular pointers
-  ////////////////////////////////////////////////////////////
-  Slave() {};
-
-  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
-
-  void SemInit(void) {
-    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
-    SEM_INIT(sem_head);
-    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
-    SEM_INIT(sem_tail);
-  }  
-  void SemInitExcl(void) {
-    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
-    SEM_INIT_EXCL(sem_head);
-    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
-    SEM_INIT_EXCL(sem_tail);
-  }  
-  void WakeUpDMA(void) { 
-    SEM_POST(sem_head);
-  };
-  void WakeUpCompute(void) { 
-    SEM_POST(sem_tail);
-  };
-  void WaitForCommand(void) { 
-    SEM_WAIT(sem_head);
-  };
-  void WaitForComplete(void) { 
-    SEM_WAIT(sem_tail);
-  };
-  void EventLoop (void) {
-    //    std::cout<< " Entering event loop "<<std::endl;
-    while(1){
-      WaitForCommand();
-      //      std::cout << "Getting command "<<std::endl;
-#if 0
-      _mm_monitor((void *)&state->head,0,0);
-      int s=state->start;
-      if ( s != state->head ) {
-	_mm_mwait(0,0);
-      }
-#endif
-      Event();
-    }
-  }
-
-  int Event (void) ;
-
-  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
-  void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
-
-  void WaitAll() {
-    //    std::cout << "Queueing WAIT command  "<<std::endl;
-    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
-    //    std::cout << "Waking up DMA "<<std::endl;
-    WakeUpDMA();
-    //    std::cout << "Waiting from semaphore "<<std::endl;
-    WaitForComplete();
-    //    std::cout << "Checking FIFO is empty "<<std::endl;
-    while ( state->tail != state->head );
-  }
-};
-
-////////////////////////////////////////////////////////////////////////
-// One instance of a data mover.
-// Master and Slave must agree on location in shared memory
-////////////////////////////////////////////////////////////////////////
-
-class MPIoffloadEngine { 
-public:
-
-  static std::vector<Slave> Slaves;
-
-  static int ShmSetup;
-  
-  static int UniverseRank;
-  static int UniverseSize;
-  
-  static MPI_Comm communicator_universe;
-  static MPI_Comm communicator_cached;
-
-  static MPI_Comm HorizontalComm;
-  static int HorizontalRank;
-  static int HorizontalSize;
-  
-  static MPI_Comm VerticalComm;
-  static MPI_Win  VerticalWindow; 
-  static int VerticalSize;
-  static int VerticalRank;
-  
-  static std::vector<void *> VerticalShmBufs;
-  static std::vector<std::vector<int> > UniverseRanks;
-  static std::vector<int> UserCommunicatorToWorldRanks; 
-  
-  static MPI_Group WorldGroup, CachedGroup;
-  
-  static void CommunicatorInit (MPI_Comm &communicator_world,
-				MPI_Comm &ShmComm,
-				void * &ShmCommBuf);
-
-  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
-
-  /////////////////////////////////////////////////////////
-  // routines for master proc must handle any communicator
-  /////////////////////////////////////////////////////////
-
-  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-     //    std::cout<< " Queueing send  "<< bytes<< " slave "<< slave << " to comm "<<rank  <<std::endl;
-    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
-    //    std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
-    Slaves[slave].WakeUpDMA();
-    //    std::cout << "Waking up DMA "<< slave<<std::endl;
-  };
-
-  static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) 
-  {
-    Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
-    Slaves[slave].WakeUpDMA();
-  }
-
-  static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-    //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl;
-    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
-    //    std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
-    Slaves[slave].WakeUpDMA();
-    //    std::cout << "Waking up DMA "<< slave<<std::endl;
-  };
-
-  static void WaitAll() {
-    for(int s=1;s<VerticalSize;s++) {
-      //      std::cout << "Waiting for slave "<< s<<std::endl;
-      Slaves[s].WaitAll();
-    }
-    //    std::cout << " Wait all Complete "<<std::endl;
-  };
-
-  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
-    int basework = nwork/units;
-    int backfill = units-(nwork%units);
-    if ( me >= units ) { 
-      mywork = myoff = 0;
-    } else { 
-      mywork = (nwork+me)/units;
-      myoff  = basework * me;
-      if ( me > backfill ) 
-	myoff+= (me-backfill);
-    }
-    return;
-  };
-
-  static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
-    uint8_t * cxbuf = (uint8_t *) xbuf;
-    uint8_t * crbuf = (uint8_t *) rbuf;
-    static int rrp=0;
-    int procs = VerticalSize-1;
-    int myoff=0;
-    int mywork=bytes;
-    QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
-    rrp = rrp+1;
-    if ( rrp == (VerticalSize-1) ) rrp = 0;
-  }
-
-  static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
-    uint8_t * cxbuf = (uint8_t *) xbuf;
-    uint8_t * crbuf = (uint8_t *) rbuf;
-    int mywork, myoff, procs;
-    procs = VerticalSize-1;
-    for(int s=0;s<procs;s++) {
-      GetWork(bytes,s,mywork,myoff,procs);
-      QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
-    }
-  };
-  static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-    uint8_t * cbuf = (uint8_t *) buf;
-    int mywork, myoff, procs;
-    procs = VerticalSize-1;
-    for(int s=0;s<procs;s++) {
-      GetWork(bytes,s,mywork,myoff,procs);
-      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
-    }
-  };
-
-  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-    uint8_t * cbuf = (uint8_t *) buf;
-    int mywork, myoff, procs;
-    procs = VerticalSize-1;
-    for(int s=0;s<procs;s++) {
-      GetWork(bytes,s,mywork,myoff,procs);
-      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
-    }
-  };
-
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-std::vector<Slave> MPIoffloadEngine::Slaves;
-    
-int MPIoffloadEngine::UniverseRank;
-int MPIoffloadEngine::UniverseSize;
-
-MPI_Comm  MPIoffloadEngine::communicator_universe;
-MPI_Comm  MPIoffloadEngine::communicator_cached;
-MPI_Group MPIoffloadEngine::WorldGroup;
-MPI_Group MPIoffloadEngine::CachedGroup;
-
-MPI_Comm MPIoffloadEngine::HorizontalComm;
-int      MPIoffloadEngine::HorizontalRank;
-int      MPIoffloadEngine::HorizontalSize;
-
-MPI_Comm MPIoffloadEngine::VerticalComm;
-int      MPIoffloadEngine::VerticalSize;
-int      MPIoffloadEngine::VerticalRank;
-MPI_Win  MPIoffloadEngine::VerticalWindow; 
-std::vector<void *>            MPIoffloadEngine::VerticalShmBufs;
-std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
-std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks; 
-
-int CartesianCommunicator::NodeCount(void)    { return HorizontalSize;};
-int MPIoffloadEngine::ShmSetup = 0;
-
-void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
-					 MPI_Comm &ShmComm,
-					 void * &ShmCommBuf)
-{      
-  int flag;
-  assert(ShmSetup==0);  
-  
-  //////////////////////////////////////////////////////////////////////
-  // Universe is all nodes prior to squadron grouping
-  //////////////////////////////////////////////////////////////////////
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
-  MPI_Comm_rank(communicator_universe,&UniverseRank);
-  MPI_Comm_size(communicator_universe,&UniverseSize);
-  
-  /////////////////////////////////////////////////////////////////////
-  // Split into groups that can share memory (Verticals)
-  /////////////////////////////////////////////////////////////////////
-#undef MPI_SHARED_MEM_DEBUG
-#ifdef  MPI_SHARED_MEM_DEBUG
-  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
-#else 
-  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
-#endif
-  MPI_Comm_rank(VerticalComm     ,&VerticalRank);
-  MPI_Comm_size(VerticalComm     ,&VerticalSize);
-  
-  //////////////////////////////////////////////////////////////////////
-  // Split into horizontal groups by rank in squadron
-  //////////////////////////////////////////////////////////////////////
-  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
-  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
-  MPI_Comm_size(HorizontalComm,&HorizontalSize);
-  assert(HorizontalSize*VerticalSize==UniverseSize);
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  // What is my place in the world
-  ////////////////////////////////////////////////////////////////////////////////
-  int WorldRank=0;
-  if(VerticalRank==0) WorldRank = HorizontalRank;
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
-  assert(ierr==0);
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  // Where is the world in the universe?
-  ////////////////////////////////////////////////////////////////////////////////
-  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
-  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
-  for(int w=0;w<HorizontalSize;w++){
-    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
-    assert(ierr==0);
-  }
-  
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  VerticalShmBufs.resize(VerticalSize);
-
-#undef MPI_SHARED_MEM
-#ifdef MPI_SHARED_MEM
-  ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
-  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
-  assert(ierr==0);
-  //  std::cout<<"SHM "<<ShmCommBuf<<std::endl;
-
-  for(int r=0;r<VerticalSize;r++){
-    MPI_Aint sz;
-    int dsp_unit;
-    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
-    //    std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
-  }
-#else 
-  char shm_name [NAME_MAX];
-  MPI_Barrier(VerticalComm);
-
-  if ( VerticalRank == 0 ) {
-    for(int r=0;r<VerticalSize;r++){
-
-      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      if ( r>0 ) size = sizeof(SlaveState);
-
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
-      
-      shm_unlink(shm_name);
-
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
-      if ( fd < 0 ) {
-	perror("failed shm_open");
-	assert(0);
-      }
-
-      ftruncate(fd, size);
-
-      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( VerticalShmBufs[r] == MAP_FAILED ) { 
-	perror("failed mmap");
-	assert(0);
-      }
-
-      /*
-      for(uint64_t page=0;page<size;page+=4096){
-	void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
-	int status;
-	int flags=MPOL_MF_MOVE_ALL;
-	int nodes=1; // numa domain == MCDRAM
-	unsigned long count=1;
-	ierr= move_pages(0,count, &pages,&nodes,&status,flags);
-	if (ierr && (page==0)) perror("numa relocate command failed");
-      }
-      */
-      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
-      check[0] = WorldRank;
-      check[1] = r;
-
-      //      std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
-    }
-  }
-
-  MPI_Barrier(VerticalComm);
-
-  if ( VerticalRank != 0 ) { 
-  for(int r=0;r<VerticalSize;r++){
-
-    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
-    if ( r>0 ) size = sizeof(SlaveState);
-    
-    sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
-    
-    int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
-    if ( fd<0 ) {
-      perror("failed shm_open");
-      assert(0);
-    }
-    VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-
-    uint64_t * check = (uint64_t *) VerticalShmBufs[r];
-    assert(check[0]== WorldRank);
-    assert(check[1]== r);
-    //    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
-  }
-  }
-#endif
-  MPI_Barrier(VerticalComm);
-
-  //////////////////////////////////////////////////////////////////////
-  // Map rank of leader on node in their in new world, to the
-  // rank in this vertical plane's horizontal communicator
-  //////////////////////////////////////////////////////////////////////
-  communicator_world = HorizontalComm;
-  ShmComm            = VerticalComm;
-  ShmCommBuf         = VerticalShmBufs[0];
-  MPI_Comm_group (communicator_world, &WorldGroup); 
-  
-  ///////////////////////////////////////////////////////////
-  // Start the slave data movers
-  ///////////////////////////////////////////////////////////
-  if ( VerticalRank != 0 ) {
-    Slave indentured;
-    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
-    indentured.SemInitExcl();// init semaphore in shared memory
-    MPI_Barrier(VerticalComm);
-    MPI_Barrier(VerticalComm);
-    indentured.EventLoop();
-    assert(0);
-  } else {
-    Slaves.resize(VerticalSize);
-    for(int i=1;i<VerticalSize;i++){
-      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
-    }
-    MPI_Barrier(VerticalComm);
-    for(int i=1;i<VerticalSize;i++){
-      Slaves[i].SemInit();// init semaphore in shared memory
-    }
-    MPI_Barrier(VerticalComm);
-  }
-  
-  ///////////////////////////////////////////////////////////
-  // Verbose for now
-  ///////////////////////////////////////////////////////////
-  
-  ShmSetup=1;
-  
-  if (UniverseRank == 0){
-      
-    std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
-    std::cout<<UniverseSize   << " Ranks " ;
-    std::cout<<HorizontalSize << " Nodes " ;
-    std::cout<<VerticalSize   << " with ranks-per-node "<<std::endl;
-    
-    std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
-    std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
-    
-    for(int g=0;g<HorizontalSize;g++){
-      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
-    }
-    
-    for(int g=0;g<HorizontalSize;g++){
-      std::cout<<GridLogMessage<<" { ";
-      for(int s=0;s<VerticalSize;s++){
-	std::cout<< UniverseRanks[g][s];
-	if ( s<VerticalSize-1 ) {
-	  std::cout<<",";
-	}
-      }
-      std::cout<<" } "<<std::endl;
-    }
-  }
-};
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////
-  // Map the communicator into communicator_world, and find the neighbour.
-  // Cache the mappings; cache size is 1.
-  ///////////////////////////////////////////////////////////////////////////////////////////////
-void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
-
-  if ( comm == HorizontalComm ) {
-    comm_world_peer = rank;
-    //    std::cout << " MapCommRankToWorldRank  horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
-  } else if ( comm == communicator_cached ) {
-    comm_world_peer = UserCommunicatorToWorldRanks[rank];
-    //    std::cout << " MapCommRankToWorldRank  cached " <<rank<<"->"<<comm_world_peer<<std::endl;
-  } else { 
-    
-    int size;
-
-    MPI_Comm_size(comm,&size);
-
-    UserCommunicatorToWorldRanks.resize(size);
-
-    std::vector<int> cached_ranks(size); 
-
-    for(int r=0;r<size;r++) {
-      cached_ranks[r]=r;
-    }
-
-    communicator_cached=comm;
-    
-    MPI_Comm_group(communicator_cached, &CachedGroup);
-    
-    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]); 
-    
-    comm_world_peer = UserCommunicatorToWorldRanks[rank];
-    //    std::cout << " MapCommRankToWorldRank  cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
-    
-    assert(comm_world_peer != MPI_UNDEFINED);
-  }
-
-  assert( (tag & (~0xFFFFL)) ==0); 
-  
-  uint64_t icomm = (uint64_t)comm;
-  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
-                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
-  
-  //  hashtag = (comm_hash<<15) | tag;      
-  hashtag = tag;      
-
-};
-
-void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
-{
-  squadron=_squadron;
-  universe_rank=_universe_rank;
-  vertical_rank=_vertical_rank;
-  state   =_state;
-  //  std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
-  state->head = state->tail = state->start = 0;
-  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
-  int rank; MPI_Comm_rank(_squadron,&rank);
-}
-#define PERI_PLUS(A) ( (A+1)%pool )
-int Slave::Event (void) {
-
-  static int tail_last;
-  static int head_last;
-  static int start_last;
-  int ierr;
-  MPI_Status stat;
-  static int i=0;
-
-  ////////////////////////////////////////////////////
-  // Try to advance the start pointers
-  ////////////////////////////////////////////////////
-  int s=state->start;
-  if ( s != state->head ) {
-    switch ( state->Descrs[s].command ) {
-    case COMMAND_ISEND:
-      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base), 
-		       state->Descrs[s].bytes, 
-		       MPI_CHAR,
-		       state->Descrs[s].rank,
-		       state->Descrs[s].tag,
-		       MPIoffloadEngine::communicator_universe,
-		       (MPI_Request *)&state->Descrs[s].request);
-      assert(ierr==0);
-      state->start = PERI_PLUS(s);
-      return 1;
-      break;
-
-    case COMMAND_IRECV:
-      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base), 
-		     state->Descrs[s].bytes, 
-		     MPI_CHAR,
-		     state->Descrs[s].rank,
-		     state->Descrs[s].tag,
-		     MPIoffloadEngine::communicator_universe,
-		     (MPI_Request *)&state->Descrs[s].request);
-
-      //      std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
-      //      std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
-      assert(ierr==0);
-      state->start = PERI_PLUS(s);
-      return 1;
-      break;
-
-    case COMMAND_SENDRECV:
-
-      //      fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
-
-      ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
-			(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
-			MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
-
-      assert(ierr==0);
-
-      //      fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
-      //      MPI_Barrier(MPIoffloadEngine::HorizontalComm);
-      //      fprintf(stderr,"Barrier\n");
-      i++;
-
-      state->start = PERI_PLUS(s);
-
-      return 1;
-      break;
-
-    case COMMAND_WAITALL:
-
-      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
-	if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
-	  MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
-	}
-      };
-      s=PERI_PLUS(s);
-      state->start = s;
-      state->tail  = s;
-
-      WakeUpCompute();
-
-      return 1;
-      break;
-
-    default:
-      assert(0);
-      break;
-    }
-  }
-  return 0;
-}
-  //////////////////////////////////////////////////////////////////////////////
-  // External interaction with the queue
-  //////////////////////////////////////////////////////////////////////////////
-  
-void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) 
-{
-  int head =state->head;
-  int next = PERI_PLUS(head);
-  
-  // Set up descriptor
-  int worldrank;
-  int hashtag;
-  MPI_Comm    communicator;
-  MPI_Request request;
-  uint64_t relative;
-  
-  relative = (uint64_t)xbuf - base;
-  state->Descrs[head].xbuf    = relative;
-  
-  relative= (uint64_t)rbuf - base;
-  state->Descrs[head].rbuf    = relative;
-  
-  state->Descrs[head].bytes  = bytes;
-  
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
-  state->Descrs[head].dest   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].xtag    = hashtag;
-  
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
-  state->Descrs[head].src    = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].rtag    = hashtag;
-  
-  state->Descrs[head].command= COMMAND_SENDRECV;
-  
-  // Block until FIFO has space
-  while( state->tail==next );
-  
-  // Msync on weak order architectures
-  
-  // Advance pointer
-  state->head = next;
-  
-};
-uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank) 
-{
-  /////////////////////////////////////////
-  // Spin; if FIFO is full until not full
-  /////////////////////////////////////////
-  int head =state->head;
-  int next = PERI_PLUS(head);
-    
-  // Set up descriptor
-  int worldrank;
-  int hashtag;
-  MPI_Comm    communicator;
-  MPI_Request request;
-  
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
-
-  uint64_t relative= (uint64_t)buf - base;
-  state->Descrs[head].buf    = relative;
-  state->Descrs[head].bytes  = bytes;
-  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].tag    = hashtag;
-  state->Descrs[head].command= command;
-
-  /*  
-  if ( command == COMMAND_ISEND ) { 
-  std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank 
-            << " to worldrank " << worldrank <<std::endl;
-  std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
-  std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
-  } 
-  if ( command == COMMAND_IRECV ) { 
-  std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank 
-            << " from worldrank " << worldrank <<std::endl;
-  std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
-  std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
-  } 
-  */
-  // Block until FIFO has space
-  while( state->tail==next );
-
-  // Msync on weak order architectures
-  // Advance pointer
-  state->head = next;
-
-  return 0;
-}
-  
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-MPI_Comm CartesianCommunicator::communicator_world;
-
-void CartesianCommunicator::Init(int *argc, char ***argv) 
-{
-  int flag;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init(argc,argv);
-  }
-  communicator_world = MPI_COMM_WORLD;
-  MPI_Comm ShmComm;
-  MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
-  assert(ierr==0);
-  return rank;
-}
-void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
-{ 
-  _ndimension = processors.size();
-  std::vector<int> periodic(_ndimension,1);
-
-  _Nprocessors=1;
-  _processors = processors;
-
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  int Size; 
-  MPI_Comm_size(communicator_world,&Size);
-  assert(Size==_Nprocessors);
-
-  _processor_coor.resize(_ndimension);
-  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
-  MPI_Comm_rank  (communicator,&_processor);
-  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
-};
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-					   int dest,
-					   void *recv,
-					   int from,
-					   int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) { 
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
-}
-
-void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						       void *xmit,
-						       int dest,
-						       void *recv,
-						       int from,
-						       int bytes)
-{
-  uint64_t xmit_i = (uint64_t) xmit;
-  uint64_t recv_i = (uint64_t) recv;
-  uint64_t shm    = (uint64_t) ShmCommBuf;
-  // assert xmit and recv lie in shared memory region
-  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
-  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
-  assert(from!=_processor);
-  assert(dest!=_processor);
-
-  MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-
-  //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-
-  //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
-  //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
-}
-
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  MPIoffloadEngine::WaitAll();
-  //this->Barrier();
-}
-
-void CartesianCommunicator::StencilBarrier(void) { }
-
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  int nreq=list.size();
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,
-		     bytes,
-		     MPI_BYTE,
-		     root,
-		     communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr= MPI_Bcast(data,
-		      bytes,
-		      MPI_BYTE,
-		      root,
-		      communicator_world);
-  assert(ierr==0);
-}
-
-void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
-
-void *CartesianCommunicator::ShmBuffer(int rank) {
-  return NULL;
-}
-void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { 
-  return NULL;
-}
-
-
-};
-
@@ -1,273 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/Communicator_mpi.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/ActionCore.h>
-#include <mpi.h>
-
-namespace Grid {
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-MPI_Comm CartesianCommunicator::communicator_world;
-
-// Should error check all MPI calls.
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  int flag;
-  int provided;
-  MPI_Initialized(&flag); // needed to coexist with other libs apparently
-  if ( !flag ) {
-    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    if ( provided != MPI_THREAD_MULTIPLE ) {
-      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-    }
-  }
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::~CartesianCommunicator()
-{
-  int MPI_is_finalised;
-  MPI_Finalized(&MPI_is_finalised);
-  if (communicator && !MPI_is_finalised){
-    MPI_Comm_free(&communicator);
-    for(int i=0;i<  communicator_halo.size();i++){
-      MPI_Comm_free(&communicator_halo[i]);
-    }
-  }  
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
-  assert(ierr==0);
-  return rank;
-}
-void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
-  assert(ierr==0);
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-					   int dest,
-					   void *recv,
-					   int from,
-					   int bytes)
-{
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) { 
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  int myrank = _processor;
-  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
-    MPI_Request xrq;
-    MPI_Request rrq;
-
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-    
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else { 
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		      recv,bytes,MPI_CHAR,from, from,
-		      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
-    int nreq=list.size();
-    std::vector<MPI_Status> status(nreq);
-    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-    assert(ierr==0);
-  }
-}
-
-void CartesianCommunicator::Barrier(void)
-{
-  int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
-}
-
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  int ierr=MPI_Bcast(data,
-		     bytes,
-		     MPI_BYTE,
-		     root,
-		     communicator);
-  assert(ierr==0);
-}
-  ///////////////////////////////////////////////////////
-  // Should only be used prior to Grid Init finished.
-  // Check for this?
-  ///////////////////////////////////////////////////////
-int CartesianCommunicator::RankWorld(void){ 
-  int r; 
-  MPI_Comm_rank(communicator_world,&r);
-  return r;
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  int ierr= MPI_Bcast(data,
-		      bytes,
-		      MPI_BYTE,
-		      root,
-		      communicator_world);
-  assert(ierr==0);
-}
-
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
-							 int xmit_to_rank,
-							 void *recv,
-							 int recv_from_rank,
-							 int bytes,int dir)
-{
-  int myrank = _processor;
-  int ierr;
-  int ncomm  =communicator_halo.size(); 
-  int commdir=dir%ncomm;
-  
-  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
-  // Give the CPU to MPI immediately; can use threads to overlap optionally
-  MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);
-
-  list.push_back(req[0]);
-  list.push_back(req[1]);
-  return 2.0*bytes;
-}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{ 
-  int nreq=waitall.size();
-  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
-}
-double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
-						    int xmit_to_rank,
-						    void *recv,
-						    int recv_from_rank,
-						    int bytes,int dir)
-{
-  int myrank = _processor;
-  int ierr;
-  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
-
-  int ncomm  =communicator_halo.size(); 
-  int commdir=dir%ncomm;
-  // Give the CPU to MPI immediately; can use threads to overlap optionally
-  MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);
-  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
-  return 2.0*bytes;
-}
-
-
-
-}
-
@@ -32,14 +32,22 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+Grid_MPI_Comm       CartesianCommunicator::communicator_world;

 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
-  ShmInitGeneric();
+  GlobalSharedMemory::Init(communicator_world);
+  GlobalSharedMemory::SharedMemoryAllocate(
+		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
+		   GlobalSharedMemory::Hugepages);
 }

 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
-  : CartesianCommunicator(processors) { srank=0;}
+  : CartesianCommunicator(processors) 
+{
+  srank=0;
+  SetCommunicator(communicator_world);
+}

 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
@@ -54,6 +62,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
    assert(_processors[d]==1);
    _processor_coor[d] = 0;
  }
+  SetCommunicator(communicator_world);
 }

 CartesianCommunicator::~CartesianCommunicator(){}
@@ -121,6 +130,36 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
  dest=0;
 }

+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, // Blocking exchange: begins and completes the transfer in one call
+						     int xmit_to_rank,
+						     void *recv,
+						     int recv_from_rank,
+						     int bytes, int dir)
+{
+  std::vector<CommsRequest_t> list;
+  // "dir" is deliberately not forwarded: the generic SendToRecvFrom* calls below take no direction argument
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
+  return 2.0*bytes; // twice the message size -- presumably send plus receive traffic; confirm against callers
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, // Starts the exchange; requests are passed through "list"
+							 void *xmit,
+							 int xmit_to_rank,
+							 void *recv,
+							 int recv_from_rank,
+							 int bytes, int dir)
+{
+  // "dir" is deliberately not forwarded: the generic SendToRecvFromBegin takes no direction argument
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  return 2.0*bytes; // twice the message size -- presumably send plus receive traffic; confirm against callers
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  SendToRecvFromComplete(waitall); // "dir" is unused; completion is delegated to the generic call
+}
+
+void CartesianCommunicator::StencilBarrier(void) { /* empty body: this implementation does no work */ }
+

 }

@@ -1,357 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/Communicator_shmem.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-#include <mpp/shmem.h>
-#include <array>
-
-namespace Grid {
-
-  // Should error check all MPI calls.
-#define SHMEM_VET(addr) 
-
-#define SHMEM_VET_DEBUG(addr) {				\
-  if ( ! shmem_addr_accessible(addr,_processor) ) {\
-    std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
-    BACKTRACEFILE();		   \
-  }\
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Info that is setup once and indept of cartesian layout
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-typedef struct HandShake_t { 
-  uint64_t seq_local;
-  uint64_t seq_remote;
-} HandShake;
-
-std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
-  std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
-  ret.fill(SHMEM_SYNC_VALUE);
-  return ret;
-}
-static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
-
-static Vector< HandShake > XConnections;
-static Vector< HandShake > RConnections;
-
-void CartesianCommunicator::Init(int *argc, char ***argv) {
-  shmem_init();
-  XConnections.resize(shmem_n_pes());
-  RConnections.resize(shmem_n_pes());
-  for(int pe =0 ; pe<shmem_n_pes();pe++){
-    XConnections[pe].seq_local = 0;
-    XConnections[pe].seq_remote= 0;
-    RConnections[pe].seq_local = 0;
-    RConnections[pe].seq_remote= 0;
-  }
-  shmem_barrier_all();
-  ShmInitGeneric();
-}
-
-CartesianCommunicator::~CartesianCommunicator(){}
-
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent) 
-  : CartesianCommunicator(processors) 
-{
-  std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
-}
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
-{
-  _ndimension = processors.size();
-  std::vector<int> periodic(_ndimension,1);
-
-  _Nprocessors=1;
-  _processors = processors;
-  _processor_coor.resize(_ndimension);
-
-  _processor = shmem_my_pe();
-  
-  Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
-
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  int Size = shmem_n_pes(); 
-
-
-  assert(Size==_Nprocessors);
-}
-
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  static long long source ;
-  static long long dest   ;
-  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-  //  int nreduce=1;
-  //  int pestart=0;
-  //  int logStride=0;
-
-  source = u;
-  dest   = 0;
-  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all(); // necessary?
-  u = dest;
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  static long long source ;
-  static long long dest   ;
-  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-  //  int nreduce=1;
-  //  int pestart=0;
-  //  int logStride=0;
-
-  source = u;
-  dest   = 0;
-  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all(); // necessary?
-  u = dest;
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  static float source ;
-  static float dest   ;
-  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-  source = f;
-  dest   =0.0;
-  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all();
-  f = dest;
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  static float source ;
-  static float dest   = 0 ;
-  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-  if ( shmem_addr_accessible(f,_processor)  ){
-    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    return;
-  }
-
-  for(int i=0;i<N;i++){
-    dest   =0.0;
-    source = f[i];
-    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    f[i] = dest;
-  }
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  static double source;
-  static double dest  ;
-  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-  source = d;
-  dest   = 0;
-  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-  shmem_barrier_all();
-  d = dest;
-}
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  static double source ;
-  static double dest   ;
-  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-
-
-  if ( shmem_addr_accessible(d,_processor)  ){
-    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    return;
-  }
-
-  for(int i=0;i<N;i++){
-    source = d[i];
-    dest   =0.0;
-    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
-    shmem_barrier_all();
-    d[i] = dest;
-  }
-}
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  std::vector<int> coor = _processor_coor;
-
-  assert(std::abs(shift) <_processors[dim]);
-
-  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,source,_processors);
-
-  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
-  Lexicographic::IndexFromCoor(coor,dest,_processors);
-
-}
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int rank;
-  Lexicographic::IndexFromCoor(coor,rank,_processors);
-  return rank;
-}
-void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  Lexicographic::CoorFromIndex(coor,rank,_processors);
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-					   int dest,
-					   void *recv,
-					   int from,
-					   int bytes)
-{
-  SHMEM_VET(xmit);
-  SHMEM_VET(recv);
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-}
-
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  static uint64_t seq;
-
-  assert(recv!=xmit);
-  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
-  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
-
-  if ( _processor == sender ) {
-
-    // Check he has posted a receive
-    while(SendSeq->seq_remote == SendSeq->seq_local);
-
-    // Advance our send count
-    seq = ++(SendSeq->seq_local);
-    
-    // Send this packet 
-    SHMEM_VET(recv);
-    shmem_putmem(recv,xmit,bytes,receiver);
-    shmem_fence();
-
-    //Notify him we're done
-    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
-    shmem_fence();
-  }
-  if ( _processor == receiver ) {
-
-    // Post a receive
-    seq = ++(RecvSeq->seq_local);
-    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
-
-    // Now wait until he has advanced our reception counter
-    while(RecvSeq->seq_remote != RecvSeq->seq_local);
-
-  }
-}
-
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  SHMEM_VET(xmit);
-  SHMEM_VET(recv);
-  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
-  shmem_putmem(recv,xmit,bytes,dest);
-
-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all(); 
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  //  shmem_quiet();      // I'm done
-  if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too
-}
-void CartesianCommunicator::Barrier(void)
-{
-  shmem_barrier_all();
-}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-  static uint32_t word;
-  uint32_t *array = (uint32_t *) data;
-  assert( (bytes % 4)==0);
-  int words = bytes/4;
-
-  if ( shmem_addr_accessible(data,_processor)  ){
-    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
-    return;
-  }
-
-  for(int w=0;w<words;w++){
-    word = array[w];
-    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
-    if ( shmem_my_pe() != root ) {
-      array[w] = word;
-    }
-    shmem_barrier_all();
-  }
-
-}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
-  static uint32_t word;
-  uint32_t *array = (uint32_t *) data;
-  assert( (bytes % 4)==0);
-  int words = bytes/4;
-
-  for(int w=0;w<words;w++){
-    word = array[w];
-    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
-    if ( shmem_my_pe() != root ) {
-      array[w]= word;
-    }
-    shmem_barrier_all();
-  }
-}
-  
-int CartesianCommunicator::RankWorld(void){ 
-  return shmem_my_pe();
-}
-
-}
-
@@ -0,0 +1,92 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid { 
+
+// Definitions of the GlobalSharedMemory static data members
+
+uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; // 1 GiB
+int                 GlobalSharedMemory::Hugepages = 0;
+int                 GlobalSharedMemory::_ShmSetup;      // no initialiser: static storage duration, so starts at zero
+int                 GlobalSharedMemory::_ShmAlloc;      // likewise zero until an allocation succeeds
+uint64_t            GlobalSharedMemory::_ShmAllocBytes; // size handed to munmap in SharedMemoryFree
+
+std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; // unmapped one entry at a time by SharedMemoryFree
+
+Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
+int                 GlobalSharedMemory::WorldShmRank;
+int                 GlobalSharedMemory::WorldShmSize;     // loop bound used by SharedMemoryFree
+std::vector<int>    GlobalSharedMemory::WorldShmRanks;
+
+Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
+int                 GlobalSharedMemory::WorldSize;
+int                 GlobalSharedMemory::WorldRank;
+
+int                 GlobalSharedMemory::WorldNodes;
+int                 GlobalSharedMemory::WorldNode;
+
+void GlobalSharedMemory::SharedMemoryFree(void) // Unmap every buffer in WorldShmCommBufs and reset the allocation state
+{
+  assert(_ShmAlloc);          // an allocation must currently be recorded
+  assert(_ShmAllocBytes>0);
+  for(int r=0;r<WorldShmSize;r++){
+    munmap(WorldShmCommBufs[r],_ShmAllocBytes); // return value is not checked; the stale pointer stays in the vector
+  }
+  _ShmAlloc = 0;
+  _ShmAllocBytes = 0;
+}
+/////////////////////////////////
+// Alloc, free shmem region
+/////////////////////////////////
+void *SharedMemory::ShmBufferMalloc(size_t bytes){ // Returns the current heap_top, then advances it by "bytes"
+  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
+  void *ptr = (void *)heap_top;
+  heap_top  += bytes;
+  heap_bytes+= bytes;
+  if (heap_bytes >= heap_size) { // note: filling the heap exactly is also treated as exhaustion
+    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    assert(heap_bytes<heap_size); // always fails here; the counters have already been advanced
+  }
+  return ptr;
+}
+void SharedMemory::ShmBufferFreeAll(void) { // Discards every ShmBufferMalloc allocation at once
+  heap_top  =(size_t)ShmBufferSelf(); // rewind to the start of this rank's own buffer
+  heap_bytes=0;
+}
+void *SharedMemory::ShmBufferSelf(void)
+{
+  return ShmCommBufs[ShmRank]; // this rank's own entry in the per-instance buffer table
+}
+
+
+
+}
@@ -0,0 +1,165 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+// TODO
+// 1) move includes into SharedMemory.cc
+//
+// 2) split shared memory into a) optimal communicator creation from comm world
+// 
+//                             b) shared memory buffers container
+//                                -- static globally shared; init once
+//                                -- per instance set of buffers.
+//                                   
+
+#pragma once 
+
+#include <Grid/GridCore.h>
+
+#if defined (GRID_COMMS_MPI3) 
+#include <mpi.h>
+#endif 
+#include <semaphore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <zlib.h>
+#ifdef HAVE_NUMAIF_H
+#include <numaif.h>
+#endif
+
+namespace Grid {
+
+#if defined (GRID_COMMS_MPI3) 
+  typedef MPI_Comm    Grid_MPI_Comm;   // real MPI handle types when GRID_COMMS_MPI3 is defined
+  typedef MPI_Request CommsRequest_t;
+#else 
+  typedef int CommsRequest_t;          // plain int stand-ins under the same names otherwise
+  typedef int Grid_MPI_Comm;
+#endif
+
+class GlobalSharedMemory { // All members are static: one process-wide record of world-communicator and shared-memory state
+ private:
+  static const int     MAXLOG2RANKSPERNODE = 16;            
+
+  // Init once lock on the buffer allocation
+  static int      _ShmSetup;
+  static int      _ShmAlloc;
+  static uint64_t _ShmAllocBytes;
+
+ public:
+  static int      ShmSetup(void)      { return _ShmSetup; }       // read-only view of the setup flag
+  static int      ShmAlloc(void)      { return _ShmAlloc; }       // read-only view of the allocation flag
+  static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }  // read-only view of the allocated size
+  static uint64_t      MAX_MPI_SHM_BYTES;
+  static int           Hugepages;
+
+  static std::vector<void *> WorldShmCommBufs;
+
+  static Grid_MPI_Comm WorldComm;
+  static int           WorldRank;
+  static int           WorldSize;
+
+  static Grid_MPI_Comm WorldShmComm;
+  static int           WorldShmRank;
+  static int           WorldShmSize;
+
+  static int           WorldNodes;
+  static int           WorldNode;
+
+  static std::vector<int>  WorldShmRanks;
+
+  //////////////////////////////////////////////////////////////////////////////////////
+  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
+  //////////////////////////////////////////////////////////////////////////////////////
+  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
+  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  ///////////////////////////////////////////////////
+  // Provide shared memory facilities off comm world
+  ///////////////////////////////////////////////////
+  static void SharedMemoryAllocate(uint64_t bytes, int flags);
+  static void SharedMemoryFree(void);
+
+};
+
+//////////////////////////////
+// one per communicator
+//////////////////////////////
+class SharedMemory // Per-communicator view of the shared-memory buffers plus a simple heap carved from this rank's buffer
+{
+ private:
+  static const int     MAXLOG2RANKSPERNODE = 16;            
+
+  size_t heap_top;   // address of the next free byte, stored as an integer
+  size_t heap_bytes; // bytes handed out since the last ShmBufferFreeAll
+  size_t heap_size;  // capacity checked by ShmBufferMalloc
+
+ protected:
+
+  Grid_MPI_Comm    ShmComm; // for barriers
+  int    ShmRank; 
+  int    ShmSize;
+  std::vector<void *> ShmCommBufs;
+  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
+
+ public:
+  SharedMemory() : heap_top(0), heap_bytes(0), heap_size(0), ShmRank(0), ShmSize(0) {} // defined values before SetCommunicator runs
+  ~SharedMemory();
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // set the buffers & sizes
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void SetCommunicator(Grid_MPI_Comm comm);
+
+  ////////////////////////////////////////////////////////////////////////
+  // For this instance ; disjoint buffer sets between splits if split grid
+  ////////////////////////////////////////////////////////////////////////
+  void ShmBarrier(void); 
+
+  ///////////////////////////////////////////////////
+  // Call on any instance
+  ///////////////////////////////////////////////////
+  void SharedMemoryTest(void);
+  void *ShmBufferSelf(void);
+  void *ShmBuffer    (int rank);
+  void *ShmBufferTranslate(int rank,void * local_p);
+  void *ShmBufferMalloc(size_t bytes);
+  void  ShmBufferFreeAll(void) ;
+  
+  //////////////////////////////////////////////////////////////////////////
+  // Make info on Nodes & ranks and Shared memory available
+  //////////////////////////////////////////////////////////////////////////
+  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
+  int RankCount(void) { return GlobalSharedMemory::WorldSize;};
+
+};
+
+}
@@ -0,0 +1,410 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemoryMPI.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid { 
+
+/* Record comm as the world communicator and derive from it the shared-memory group, the node count and this rank's node index */
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  assert(_ShmSetup==0); // one-shot: _ShmSetup is set to 1 at the end of this function
+  WorldComm = comm;
+  MPI_Comm_rank(WorldComm,&WorldRank);
+  MPI_Comm_size(WorldComm,&WorldSize);
+  // WorldComm, WorldSize, WorldRank
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
+  // WorldShmComm, WorldShmSize, WorldShmRank
+
+  // WorldNodes: the world size must be a whole multiple of this group's size
+  WorldNodes = WorldSize/WorldShmSize;
+  assert( (WorldNodes * WorldShmSize) == WorldSize );
+
+  // Unequal group sizes across nodes are rejected further down by the leader-count assertions
+
+  /////////////////////////////////////////////////////////////////////
+  // find world ranks in our SHM group (i.e. which ranks are on our node)
+  /////////////////////////////////////////////////////////////////////
+  MPI_Group WorldGroup, ShmGroup;
+  MPI_Comm_group (WorldComm, &WorldGroup); 
+  MPI_Comm_group (WorldShmComm, &ShmGroup);
+
+  std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
+
+  WorldShmRanks.resize(WorldSize); 
+  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]); 
+  MPI_Group_free(&WorldGroup); MPI_Group_free(&ShmGroup); // translation done; release the group handles
+  ///////////////////////////////////////////////////////////////////
+  // Identify who is in my group and nominate the leader
+  ///////////////////////////////////////////////////////////////////
+  int g=0;
+  std::vector<int> MyGroup;
+  MyGroup.resize(WorldShmSize);
+  for(int rank=0;rank<WorldSize;rank++){
+    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
+      assert(g<WorldShmSize);
+      MyGroup[g++] = rank;
+    }
+  }
+  
+  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
+  int myleader = MyGroup[0];
+  
+  std::vector<int> leaders_1hot(WorldSize,0);
+  std::vector<int> leaders_group(WorldNodes,0);
+  leaders_1hot [ myleader ] = 1;
+    
+  ///////////////////////////////////////////////////////////////////
+  // global sum leaders over comm world
+  ///////////////////////////////////////////////////////////////////
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
+  assert(ierr==0);
+
+  ///////////////////////////////////////////////////////////////////
+  // find the group leaders world rank
+  ///////////////////////////////////////////////////////////////////
+  int group=0;
+  for(int l=0;l<WorldSize;l++){
+    if(!leaders_1hot[l]) continue;
+    assert(group<WorldNodes); // more leaders than WorldNodes would overrun leaders_group
+    leaders_group[group++] = l;
+  }
+  assert(group==WorldNodes); // holds on every rank only if all groups have the same size
+  ///////////////////////////////////////////////////////////////////
+  // Identify the node of the group in which I (and my leader) live
+  ///////////////////////////////////////////////////////////////////
+  WorldNode=-1;
+  for(int node=0;node<WorldNodes;node++){
+    if (myleader == leaders_group[node]){
+      WorldNode=node;
+    }
+  }
+  assert(WorldNode!=-1);
+  _ShmSetup=1;
+}
+
+// Split WorldComm into optimal_comm with ranks re-keyed so each node covers a sub-block of the processors[] grid
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = -1;
+  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
+    if ( (0x1<<i) == WorldShmSize ) {
+      log2size = i;
+      break;
+    }
+  }
+  assert(log2size != -1);
+
+  ////////////////////////////////////////////////////////////////
+  // Check processor counts match (done first: the sub-blocking loop below never terminates on too small a grid)
+  ////////////////////////////////////////////////////////////////
+  int ndimension              = processors.size();
+  int Nprocessors=1;
+  for(int i=0;i<ndimension;i++){
+    Nprocessors*=processors[i];
+  }
+  assert(WorldSize==Nprocessors);
+
+  ////////////////////////////////////////////////////////////////
+  // Identify subblock of ranks on node spreading across dims
+  // in a maximally symmetrical way
+  ////////////////////////////////////////////////////////////////
+  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
+  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
+  int dim = 0;
+  for(int l2=0;l2<log2size;l2++){
+    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; // skip dimensions that cannot be halved again
+    ShmDims[dim]*=2;
+    dim=(dim+1)%ndimension;
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Establish torus of processes and nodes with sub-blockings
+  ////////////////////////////////////////////////////////////////
+  for(int d=0;d<ndimension;d++){
+    NodeDims[d] = WorldDims[d]/ShmDims[d];
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Establish mapping between lexico physics coord and WorldRank
+  ////////////////////////////////////////////////////////////////
+  int rank;
+
+  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
+  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
+  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
+  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
+
+  /////////////////////////////////////////////////////////////////
+  // Build the new communicator
+  /////////////////////////////////////////////////////////////////
+  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); // single colour; "rank" is the ordering key
+  assert(ierr==0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended
+////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMMMAP
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Map one file of "bytes" bytes per rank of this node into WorldShmCommBufs; nonzero "flags" adds MAP_HUGETLB
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Hugetlbf and others map filesystems as mappable huge pages
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  char shm_name [NAME_MAX];
+  for(int r=0;r<WorldShmSize;r++){
+    // Bounded formatting: GRID_SHM_PATH is a build-time prefix of unknown length
+    snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
+    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
+    if ( fd == -1) { 
+      printf("open %s failed\n",shm_name);
+      perror("open hugetlbfs");
+      exit(EXIT_FAILURE); // report the failure to the launcher instead of exiting with status 0
+    }
+    int mmap_flag = MAP_SHARED ;
+#ifdef MAP_POPULATE    
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
+    if ( ptr == (void *)MAP_FAILED ) {    
+      printf("mmap %s failed\n",shm_name);
+      perror("failed mmap");      assert(0);    
+    }
+    assert(((uint64_t)ptr&0x3F)==0); // mapping must be 64-byte aligned
+    close(fd);
+    WorldShmCommBufs[r] =ptr;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif // MMAP
+
+#ifdef GRID_MPI3_SHMOPEN
+////////////////////////////////////////////////////////////////////////////////////////////
+// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
+// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
+// the posix shm virtual file system
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // bytes: size of each segment; flags: nonzero adds MAP_HUGETLB where defined
+{ 
+  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0); 
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  // Node-local rank 0 creates, sizes and maps every segment; after the barrier the other ranks open and map them
+  char shm_name [NAME_MAX];
+  if ( WorldShmRank == 0 ) {
+    for(int r=0;r<WorldShmSize;r++){
+	
+      size_t size = bytes;
+      
+      snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      
+      shm_unlink(shm_name); // drop any segment of this name first; the result is not checked
+      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
+      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
+      if ( ftruncate(fd, size) != 0 ) {	perror("failed ftruncate");	assert(0);      }
+	
+      int mmap_flag = MAP_SHARED;
+#ifdef MAP_POPULATE 
+      mmap_flag |= MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+      if (flags) mmap_flag |= MAP_HUGETLB;
+#endif
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
+      
+      if ( ptr == (void * )MAP_FAILED ) {       
+	perror("failed mmap");     
+	assert(0);    
+      }
+      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      assert(((uint64_t)ptr&0x3F)==0); // mapping must be 64-byte aligned
+      
+      WorldShmCommBufs[r] =ptr;
+      close(fd);
+    }
+  }
+
+  MPI_Barrier(WorldShmComm); // segments exist and are sized before any other rank opens them
+  
+  if ( WorldShmRank != 0 ) { 
+    for(int r=0;r<WorldShmSize;r++){
+
+      size_t size = bytes ;
+      
+      snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      
+      int fd=shm_open(shm_name,O_RDWR,0666);
+      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
+      
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      assert(((uint64_t)ptr&0x3F)==0);
+      WorldShmCommBufs[r] =ptr;
+
+      close(fd);
+    }
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes = bytes;
+}
+#endif
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  /////////////////////////////////////////////////////////////////////
+  // Bind this object to `comm`:
+  //  - split off ShmComm, the ranks of `comm` that can share memory
+  //  - point ShmCommBufs[r] at the globally allocated buffer of ShmComm rank r
+  //  - fill ShmRanks[i] with the ShmComm rank of `comm` rank i
+  //    (MPI_UNDEFINED for ranks outside ShmComm)
+  // Requires GlobalSharedMemory to have allocated its buffers already.
+  /////////////////////////////////////////////////////////////////////
+  int size;
+  MPI_Comm_size(comm,&size);
+  ShmRanks.resize(size);
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
+  MPI_Comm_rank(ShmComm     ,&ShmRank);
+  MPI_Comm_size(ShmComm     ,&ShmSize);
+  ShmCommBufs.resize(ShmSize);
+
+  //////////////////////////////////////////////////////////////////////
+  // Map ShmRank to WorldShmRank and use the right buffer
+  //////////////////////////////////////////////////////////////////////
+  assert (GlobalSharedMemory::ShmAlloc()==1);
+  heap_size = GlobalSharedMemory::ShmAllocBytes();
+  for(int r=0;r<ShmSize;r++){
+
+    // Only ShmComm rank r contributes a non-zero value, so the sum hands
+    // rank r's WorldShmRank to every member of ShmComm.
+    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
+
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
+
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
+  }
+  ShmBufferFreeAll();
+
+  /////////////////////////////////////////////////////////////////////
+  // find comm ranks in our SHM group (i.e. which ranks are on our node)
+  /////////////////////////////////////////////////////////////////////
+  MPI_Group FullGroup, ShmGroup;
+  MPI_Comm_group (comm   , &FullGroup); 
+  MPI_Comm_group (ShmComm, &ShmGroup);
+
+  std::vector<int> ranks(size);
+  for(int r=0;r<size;r++) ranks[r]=r;
+  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
+
+  // The group handles were only needed for the translation above; release
+  // them so repeated calls do not accumulate MPI group objects.
+  MPI_Group_free(&FullGroup);
+  MPI_Group_free(&ShmGroup);
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void)
+{
+  MPI_Barrier  (ShmComm); // returns once every rank of ShmComm has entered the barrier
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void)
+{
+  // ShmComm rank 0 stamps the first three 64-bit words of every buffer;
+  // after a barrier every rank checks that it reads the same stamp back.
+  const uint64_t magic = 0x5A5A5A;
+
+  ShmBarrier();
+  if ( ShmRank == 0 ) {
+    for(int peer=0;peer<ShmSize;peer++){
+      uint64_t * words = (uint64_t *) ShmCommBufs[peer];
+      words[0] = GlobalSharedMemory::WorldNode;
+      words[1] = peer;
+      words[2] = magic;
+    }
+  }
+  ShmBarrier();
+
+  for(int peer=0;peer<ShmSize;peer++){
+    const uint64_t * words = (const uint64_t *) ShmCommBufs[peer];
+    assert(words[0]==GlobalSharedMemory::WorldNode);
+    assert(words[1]==peer);
+    assert(words[2]==magic);
+  }
+  ShmBarrier();
+}
+
+void *SharedMemory::ShmBuffer(int rank)
+{
+  // Shared buffer of communicator rank `rank`, or NULL when that rank has
+  // no entry in our shared-memory group (ShmRanks holds MPI_UNDEFINED).
+  const int peer = ShmRanks[rank];
+  if (peer == MPI_UNDEFINED) return NULL;
+  return ShmCommBufs[peer];
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  // Translate `local_p`, an address inside this rank's shared buffer, into the
+  // address at the same byte offset inside the buffer of communicator rank `rank`.
+  // Returns NULL when `rank` is not in our shared-memory group
+  // (its ShmRanks entry is MPI_UNDEFINED). Translating to ourselves is a bug.
+  int gpeer = ShmRanks[rank];
+  assert(gpeer!=ShmRank); // never send to self
+  if (gpeer == MPI_UNDEFINED){
+    return NULL;
+  } else { 
+    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
+    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    return (void *) remote;
+  }
+}
+SharedMemory::~SharedMemory()
+{
+  // Free the node-local communicator, but only if MPI has not been finalised yet.
+  int finalised = 0;
+  MPI_Finalized(&finalised);
+  if ( finalised ) return;
+  MPI_Comm_free(&ShmComm);
+}
+
+}
@@ -0,0 +1,128 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid { 
+
+/*Construct from an MPI communicator*/
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  // `comm` is not consulted here: the state always describes a world of
+  // exactly one rank on one node. May only be called once.
+  assert(_ShmSetup==0);
+
+  // World-level view
+  WorldComm  = 0;
+  WorldRank  = 0;
+  WorldSize  = 1;
+  WorldNodes = 1;
+  WorldNode  = 0;
+
+  // Node-local view
+  WorldShmComm = 0;
+  WorldShmRank = 0;
+  WorldShmSize = 1;
+
+  WorldShmRanks.resize(WorldSize);
+  WorldShmRanks[0] = 0;
+  WorldShmCommBufs.resize(1);
+
+  _ShmSetup=1;
+}
+
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  optimal_comm = WorldComm; // `processors` is ignored here: the world communicator is handed back unchanged
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended, use anonymous mmap
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  // Map `bytes` of anonymous shared memory, zero it and publish it as
+  // WorldShmCommBufs[0]. A non-zero `flags` adds MAP_HUGETLB where defined.
+  // Exits the process if the mapping fails.
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+
+  int map_opts = 0;
+#ifdef MAP_ANONYMOUS
+  map_opts |= MAP_SHARED | MAP_ANONYMOUS;
+#endif
+#ifdef MAP_ANON
+  map_opts |= MAP_SHARED | MAP_ANON;
+#endif
+#ifdef MAP_HUGETLB
+  if ( flags ) map_opts |= MAP_HUGETLB;
+#endif
+
+  void * const region = mmap(NULL, bytes, PROT_READ | PROT_WRITE, map_opts, -1, 0);
+  if ( region == (void *)MAP_FAILED ) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);
+  }
+#ifdef MADV_HUGEPAGE
+  // Only advise huge pages when the Hugepages setting is off.
+  if ( !Hugepages ) madvise(region,bytes,MADV_HUGEPAGE);
+#endif
+  bzero(region,bytes);
+
+  WorldShmCommBufs[0] = region;
+  _ShmAllocBytes=bytes;
+  _ShmAlloc=1;
+}
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  // `comm` is not consulted here: the shared-memory group is always the single
+  // rank 0, backed by the one globally allocated buffer.
+  assert(GlobalSharedMemory::ShmAlloc()==1);
+
+  ShmRank = 0;
+  ShmSize = 1;
+
+  ShmRanks.resize(1);
+  ShmRanks[0] = 0;
+
+  // Use the global buffer directly and record its size
+  ShmCommBufs.resize(1);
+  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
+  heap_size      = GlobalSharedMemory::ShmAllocBytes();
+
+  ShmBufferFreeAll();
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void){ return ; } // no-op in this build: returns immediately
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void) { return; } // no-op in this build: nothing is written or checked
+
+void *SharedMemory::ShmBuffer(int rank)
+{
+  return NULL; // always NULL in this build, whatever `rank` is
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  return NULL; // always NULL in this build: `rank` and `local_p` are ignored
+}
+SharedMemory::~SharedMemory()
+{}; // empty: this destructor releases nothing itself
+
+}
@@ -86,6 +86,7 @@ protected:
  Colours &Painter;
  int active;
  int timing_mode;
+  int topWidth{-1};
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
@@ -124,11 +125,17 @@ public:
      Reset(); 
    }
  }
+  void setTopWidth(const int w) {topWidth = w;} // operator<< pads topName with std::setw(topWidth) only when topWidth > 0

  friend std::ostream& operator<< (std::ostream& stream, Logger& log){

    if ( log.active ) {
-      stream << log.background()<<  std::left << log.topName << log.background()<< " : ";
+      stream << log.background()<<  std::left;
+      if (log.topWidth > 0)
+      {
+        stream << std::setw(log.topWidth);
+      }
+      stream << log.topName << log.background()<< " : ";
      stream << log.colour() <<  std::left << log.name << log.background() << " : ";
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
@@ -39,6 +39,7 @@ namespace QCD {
    static const int Zdir = 2;
    static const int Tdir = 3;

+  
    static const int Xp = 0;
    static const int Yp = 1;
    static const int Zp = 2;
@@ -420,15 +421,16 @@ namespace QCD {
    //////////////////////////////////////////////
    // Fermion <-> propagator assignements
    //////////////////////////////////////////////
-    template <class Prop, class Ferm>
-    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
    {
-        for(int j = 0; j < Ns; ++j)
+      for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(pjs, peekColour(fj, i), i, c);
            }
@@ -436,15 +438,16 @@ namespace QCD {
        }
    }
    
-    template <class Prop, class Ferm>
-    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
    {
        for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(fj, peekColour(pjs, i, c), i);
            }
@@ -492,41 +495,17 @@ namespace QCD {
      return traceIndex<ColourIndex>(lhs);
    }

+    //////////////////////////////////////////
+    // Current types
+    //////////////////////////////////////////
+    GRID_SERIALIZABLE_ENUM(Current, undef,
+                           Vector,  0,
+                           Axial,   1,
+                           Tadpole, 2);
+
 }   //namespace QCD
 } // Grid

-/*
-<<<<<<< HEAD
-#include <Grid/qcd/utils/SpaceTimeGrid.h>
-#include <Grid/qcd/spin/Dirac.h>
-#include <Grid/qcd/spin/TwoSpinor.h>
-#include <Grid/qcd/utils/LinalgUtils.h>
-#include <Grid/qcd/utils/CovariantCshift.h>
-
-// Include representations  
-#include <Grid/qcd/utils/SUn.h>
-#include <Grid/qcd/utils/SUnAdjoint.h>
-#include <Grid/qcd/utils/SUnTwoIndex.h>
-#include <Grid/qcd/representations/hmc_types.h>
-
-// Scalar field
-#include <Grid/qcd/utils/ScalarObjs.h>
-
-#include <Grid/qcd/action/Actions.h>
-
-#include <Grid/qcd/smearing/Smearing.h>
-
-#include <Grid/qcd/hmc/integrators/Integrator.h>
-#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
-#include <Grid/qcd/observables/hmc_observable.h>
-#include <Grid/qcd/hmc/HMC.h>
-
-
-//#include <Grid/qcd/modules/mods.h>
-=======
-
->>>>>>> develop
-*/


 #endif
@@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  this->DW(psi,tmp_f,DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
  }
 }

@@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////

 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
-//#include <Grid/qcd/action/fermion/CloverFermion.h>
+
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
@@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;

+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
+
+// Twisted mass fermion
 typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;

+// Clover fermions
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
+
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+
+// Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
@@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>

 #define TwoIndexFermOpTemplateInstantiate(A) \
  template class A<WilsonTwoIndexSymmetricImplF>; \
-  template class A<WilsonTwoIndexSymmetricImplD>; 
+  template class A<WilsonTwoIndexSymmetricImplD>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplD>;

 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
@@ -113,6 +113,21 @@ namespace Grid {
      ///////////////////////////////////////////////
      virtual void ImportGauge(const GaugeField & _U)=0;

+      //////////////////////////////////////////////////////////////////////
+      // Conserved currents, either contract at sink or insert sequentially.
+      //////////////////////////////////////////////////////////////////////
+      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+                                            PropagatorField &q_in_2,
+                                            PropagatorField &q_out,
+                                            Current curr_type,
+                                            unsigned int mu)=0;
+      virtual void SeqConservedCurrent(PropagatorField &q_in, 
+                                       PropagatorField &q_out,
+                                       Current curr_type,
+                                       unsigned int mu,
+                                       std::vector<Real> mom,
+                                       unsigned int tmin, 
+                                       unsigned int tmax)=0;
    };

  }
@@ -164,6 +164,7 @@ namespace QCD {
    public:

    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=false;
    static const int Nhcs = Options::Nhcs;

@@ -212,6 +213,13 @@ namespace QCD {
                         StencilImpl &St) {
      mult(&phi(), &U(mu), &chi());
    }
+    
+    inline void multLinkProp(SitePropagator &phi,
+                             const SiteDoubledGaugeField &U,
+                             const SitePropagator &chi,
+                             int mu) {
+       mult(&phi(), &U(mu), &chi()); // presumably phi = U(mu) * chi, as in multLink - confirm against mult()
+    }
      
    template <class ref>
    inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -254,8 +262,22 @@ namespace QCD {
      GaugeLinkField link(mat._grid);
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
      PokeIndex<LorentzIndex>(mat,link,mu);
-    }   
+    }  
+    
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+      mat = outerProduct(B,A); // mat is overwritten with the outer product of B and A
+    }  
+
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+      mat = TraceIndex<SpinIndex>(P); // mat is overwritten with P traced over its spin index
+    }
      
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+      // Copy Lorentz components 0 .. Nd-1 of Uds into mat[0 .. Nd-1].
+      // mat is indexed directly, so it must already hold at least Nd fields.
+      for (int dir = 0; dir < Nd; ++dir) {
+        mat[dir] = PeekIndex<LorentzIndex>(Uds, dir);
+      }
+    }
+
+
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
      
      int Ls=Btilde._grid->_fdimensions[0];
@@ -277,27 +299,28 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index, 5d redblack
  ////////////////////////////////////////////////////////////////////////////////////
-template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
-class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
  public:

-  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
  INHERIT_GIMPL_TYPES(Gimpl);

-  static const int Dimension = Nrepresentation;
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised=true;
  static const int Nhcs = Options::Nhcs;
      
  typedef typename Options::_Coeff_t Coeff_t;      
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
  
-  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
-  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
-  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
  
  typedef iImplSpinor<Simd>            SiteSpinor;
  typedef iImplPropagator<Simd>        SitePropagator;
@@ -333,14 +356,27 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
                       StencilImpl &St) {
    SiteGaugeLink UU;
-    for (int i = 0; i < Nrepresentation; i++) {
-      for (int j = 0; j < Nrepresentation; j++) {
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
        vsplat(UU()()(i, j), U(mu)()(i, j));
      }
    }
    mult(&phi(), &UU(), &chi());
  }
-      
+
+  inline void multLinkProp(SitePropagator &phi,
+                           const SiteDoubledGaugeField &U,
+                           const SitePropagator &chi,
+                           int mu) {
+    // Build a SIMD link by splatting each colour entry of U(mu),
+    // then apply it to chi exactly as multLink does for half spinors.
+    SiteGaugeLink splatLink;
+    for (int row = 0; row < Dimension; ++row) {
+      for (int col = 0; col < Dimension; ++col) {
+        vsplat(splatLink()()(row, col), U(mu)()(row, col));
+      }
+    }
+    mult(&phi(), &splatLink(), &chi());
+  }
+
  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
  {
    SiteScalarGaugeField  ScalarUmu;
@@ -373,6 +409,19 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
    assert(0);
  }

+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0); // not implemented for this Impl: trips the assertion if ever called
+  } 
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0); // not implemented for this Impl: trips the assertion if ever called
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0); // not implemented for this Impl: trips the assertion if ever called
+  }
+
+
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

    assert(0);
@@ -425,25 +474,26 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
-template <class S, int Nrepresentation, class Options=CoeffReal>
-class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:

- static const int Dimension = Nrepresentation;
+ static const int Dimension = Representation::Dimension;
+ static const bool isFundamental = Representation::isFundamental;
 static const int Nhcs = Options::Nhcs;
 static const bool LsVectorised=false;

- typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 INHERIT_GIMPL_TYPES(Gimpl);

 typedef typename Options::_Coeff_t Coeff_t;
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
      
- template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>,   Ngp>;
- template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>,   Ngp>;
- template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>,  Ngp>;
- template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
- template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
+ template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;

 typedef iImplSpinor<Simd>            SiteSpinor;
 typedef iImplPropagator<Simd>        SitePropagator;
@@ -537,7 +587,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   }
   
 }
-
+    // Fixme: Gparity prop * link
+    inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
+                             const SitePropagator &chi, int mu)
+    {
+        assert(0); // placeholder: link * propagator is not implemented here, so any call trips the assertion
+    }

 template <class ref>
 inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -611,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   return;
 }
      
+ inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+   //mat = outerProduct(Btilde, A);
+   assert(0); // disabled: the candidate implementation above is commented out, so any call trips the assertion
+  }
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0); // disabled: the candidate implementation below is commented out, so any call trips the assertion
+    /*
+    auto tmp = TraceIndex<SpinIndex>(P);
+    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
+    }
+    */
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0); // not implemented for this Impl: trips the assertion if ever called
+  }
+  
 inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

   int Ls = Btilde._grid->_fdimensions[0];
@@ -640,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:

    typedef RealD  _Coeff_t ;
    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=false;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
      
@@ -751,8 +826,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
      GaugeLinkField link(mat._grid);
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
      PokeIndex<LorentzIndex>(mat,link,mu);
-    }   
-      
+    } 
+          
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
      assert (0); 
      // Must never hit
@@ -768,6 +843,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
    public:

    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=true;
    typedef RealD   Coeff_t ;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -958,29 +1034,33 @@ typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > Wilso
 typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
 typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
+typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR;   // Real.. whichever prec
+typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
+
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
 
-typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
 
-typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double

 typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
@@ -393,6 +393,31 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
  }
 };

+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented: the entry points below are placeholders that trip assert(0).
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                        PropagatorField &q_in_2,
+                                                        PropagatorField &q_out,
+                                                        Current curr_type,
+                                                        unsigned int mu)
+{
+    assert(0); // placeholder: no argument is read and q_out is left untouched
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu, 
+                                                         std::vector<Real> mom,
+                                                         unsigned int tmin,
+                                                         unsigned int tmax)
+{
+    assert(0); // placeholder: not implemented for this action; no argument is read and q_out is left untouched
+}
+
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);

  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
@@ -157,6 +157,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities (the ImprovedStaggeredFermion definitions are placeholders that assert(0))
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           std::vector<Real> mom,
+                           unsigned int tmin,
+                           unsigned int tmax);
 };

 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@@ -405,6 +405,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
  MooeeInv(in, out);
 }

+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented: the entry points below are placeholders that trip assert(0).
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                         PropagatorField &q_in_2,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu)
+{
+    assert(0); // placeholder: no argument is read and q_out is left untouched
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                          PropagatorField &q_out,
+                                                          Current curr_type,
+                                                          unsigned int mu, 
+                                                          std::vector<Real> mom,
+                                                          unsigned int tmin,
+                                                          unsigned int tmax)
+{
+    assert(0); // placeholder: not implemented for this action; no argument is read and q_out is left untouched
+}

 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
 FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
@@ -170,6 +170,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type,
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu, 
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -0,0 +1,243 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/Eigen/Dense>
+#include <Grid/qcd/spin/Dirac.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+// Full operator for an unpreconditioned (*NOT* even-odd) field:
+//   out = Dhop(in) + Mooee(in)
+// Returns norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  // Hopping (Wilson) part is written directly into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerNo);
+
+  // Clover part goes through a scratch field and is then accumulated
+  FermionField clover_part(out._grid);
+  Mooee(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Adjoint of the full operator:
+//   out = Dhop(in, DaggerYes) + MooeeDag(in)
+// Returns norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  // Daggered hopping (Wilson) part is written directly into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerYes);
+
+  // Daggered clover part goes through a scratch field and is then accumulated
+  FermionField clover_part(out._grid);
+  MooeeDag(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Rebuilds every gauge-dependent object: the base-class (Wilson) links, the
+// site-local clover matrix (which includes diag_mass), its site-wise inverse,
+// and the even/odd copies of both, plain and adjoint.
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  WilsonFermion<Impl>::ImportGauge(_Umu);
+
+  GridBase *grid = _Umu._grid;
+  typedef typename Impl::GaugeLinkField LinkField;
+  LinkField Bx(grid), By(grid), Bz(grid);
+  LinkField Ex(grid), Ey(grid), Ez(grid);
+
+  // Field strength components, always requested with mu > nu
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Spin-colour clover matrix: the three B terms are weighted by csw_r, the
+  // three E terms by csw_t, and diag_mass is added last.
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += diag_mass;
+
+  // Site-by-site inversion through a dense (Ns*DimRep) x (Ns*DimRep) Eigen matrix
+  const int lvol = grid->lSites();
+  const int DimRep = Impl::Dimension;
+  const int dim = Ns * DimRep;
+
+  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(dim, dim);
+  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(dim, dim);
+
+  std::vector<int> lcoor;
+  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
+
+  for (int site = 0; site < lvol; ++site)
+  {
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    EigenCloverOp = Eigen::MatrixXcd::Zero(dim, dim);
+    peekLocalSite(Qx, CloverTerm, lcoor);
+    Qxinv = zero;
+
+    // Flatten (spin j,k ; colour a,b) into (row, column) of the dense matrix
+    for (int j = 0; j < Ns; j++)
+    {
+      for (int k = 0; k < Ns; k++)
+      {
+        for (int a = 0; a < DimRep; a++)
+        {
+          for (int b = 0; b < DimRep; b++)
+          {
+            EigenCloverOp(j * DimRep + a, k * DimRep + b) = Qx()(j, k)(a, b);
+          }
+        }
+      }
+    }
+
+    EigenInvCloverOp = EigenCloverOp.inverse();
+
+    // Unflatten the inverse back into the site tensor
+    for (int j = 0; j < Ns; j++)
+    {
+      for (int k = 0; k < Ns; k++)
+      {
+        for (int a = 0; a < DimRep; a++)
+        {
+          for (int b = 0; b < DimRep; b++)
+          {
+            Qxinv()(j, k)(a, b) = EigenInvCloverOp(j * DimRep + a, k * DimRep + b);
+          }
+        }
+      }
+    }
+
+    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
+  }
+
+  // Even/odd copies of the clover term and of its adjoint
+  pickCheckerboard(Even, CloverTermEven, CloverTerm);
+  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
+
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+
+  // Even/odd copies of the inverse and of its adjoint
+  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
+  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
+
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+}
+
+// Clover term applied to in: forwards to MooeeInternal with DaggerNo, InverseNo.
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  MooeeInternal(in, out, DaggerNo, InverseNo);
+}
+
+// Adjoint clover term applied to in: forwards to MooeeInternal with DaggerYes, InverseNo.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  MooeeInternal(in, out, DaggerYes, InverseNo);
+}
+
+// Inverse clover term applied to in: forwards to MooeeInternal with DaggerNo, InverseYes.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  MooeeInternal(in, out, DaggerNo, InverseYes);
+}
+
+// Adjoint of the inverse clover term applied to in: forwards to MooeeInternal with DaggerYes, InverseYes.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  MooeeInternal(in, out, DaggerYes, InverseYes);
+}
+
+// Applies one of the stored clover fields to in, selected by (dag, inv):
+//  - on a non-checkerboarded grid the full-grid term (or its inverse when inv
+//    is set) is used and the adjoint is formed on the fly when dag is set;
+//  - on a checkerboarded grid the even/odd field matching in.checkerboard is
+//    used, taken from the Dag / Inv / InvDag variants as requested.
+// out inherits in's checkerboard.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+{
+  out.checkerboard = in.checkerboard;
+  assert(in.checkerboard == Odd || in.checkerboard == Even);
+
+  // Full grid: only the un-daggered fields are needed
+  if (!in._grid->_isCheckerBoarded)
+  {
+    CloverFieldType &full = (inv) ? CloverTermInv : CloverTerm;
+    if (dag)
+    {
+      out = adj(full) * in;
+    }
+    else
+    {
+      out = full * in;
+    }
+    return;
+  }
+
+  // Half grid: pick the field stored for this parity
+  const bool odd = (in.checkerboard == Odd);
+  CloverFieldType *half;
+  if (dag)
+  {
+    if (inv)
+    {
+      half = odd ? &CloverTermInvDagOdd : &CloverTermInvDagEven;
+    }
+    else
+    {
+      half = odd ? &CloverTermDagOdd : &CloverTermDagEven;
+    }
+  }
+  else
+  {
+    if (inv)
+    {
+      half = odd ? &CloverTermInvOdd : &CloverTermInvEven;
+    }
+    else
+    {
+      half = odd ? &CloverTermOdd : &CloverTermEven;
+    }
+  }
+  out = *half * in;
+
+} // MooeeInternal
+
+
+// Derivative parts: MooDeriv is a placeholder for WilsonCloverFermion and trips assert(0) when called
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+{
+  assert(0); // not implemented; mat is left untouched
+}
+
+// Derivative parts: MeeDeriv is a placeholder for WilsonCloverFermion and trips assert(0) when called
+template <class Impl>
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  assert(0); // not implemented yet; mat is left untouched
+}
+
+FermOpTemplateInstantiate(WilsonCloverFermion);
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
+}
+}
@@ -0,0 +1,366 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
+
+    Copyright (C) 2017
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: David Preti <>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
+#define GRID_QCD_WILSON_CLOVER_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+///////////////////////////////////////////////////////////////////
+// Wilson Clover
+//
+// Operator ( with anisotropy coefficients):
+//
+// Q =   1 + (Nd-1)*nu/xi_0 + m
+//     + W_t + (nu/xi_0) * W_s
+//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_r/xi_0) * sum_ss (sigma_ss F_ss)  ]
+//
+// s spatial, t temporal directions.
+// where W_t and W_s are the temporal and spatial components of the
+// Wilson Dirac operator
+//
+// csw_r = csw_t to recover the isotropic version
+//////////////////////////////////////////////////////////////////
+
+template <class Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>
+{
+public:
+  // Types definitions
+  INHERIT_IMPL_TYPES(Impl);
+  template <typename vtype>
+  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+  typedef iImplClover<Simd> SiteCloverType;
+  typedef Lattice<SiteCloverType> CloverFieldType;
+
+public:
+  typedef WilsonFermion<Impl> WilsonBase;
+
+  virtual void Instantiatable(void){};
+  // Constructor.
+  // Forwards the gauge field, grids, mass, implementation parameters and
+  // anisotropy coefficients to WilsonFermion, allocates the clover fields
+  // (full-grid ones on Fgrid, even/odd ones on Hgrid), stores the clover
+  // coefficients scaled by 0.5 (the spatial one additionally divided by xi_0
+  // when anisotropic), sets diag_mass, and finally calls ImportGauge.
+  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      const RealD _mass,
+                      const RealD _csw_r = 0.0,
+                      const RealD _csw_t = 0.0,
+                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                      const ImplParams &impl_p = ImplParams())
+      : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy),
+        CloverTerm(&Fgrid),
+        CloverTermInv(&Fgrid),
+        CloverTermEven(&Hgrid),
+        CloverTermOdd(&Hgrid),
+        CloverTermInvEven(&Hgrid),
+        CloverTermInvOdd(&Hgrid),
+        CloverTermDagEven(&Hgrid),
+        CloverTermDagOdd(&Hgrid),
+        CloverTermInvDagEven(&Hgrid),
+        CloverTermInvDagOdd(&Hgrid)
+  {
+    assert(Nd == 4); // require 4 dimensions
+
+    const bool anisotropic = clover_anisotropy.isAnisotropic;
+
+    // Temporal coefficient is the same in both cases
+    csw_t = _csw_t * 0.5;
+
+    // Spatial coefficient and diagonal term depend on the anisotropy
+    csw_r = anisotropic ? (_csw_r * 0.5 / clover_anisotropy.xi_0)
+                        : (_csw_r * 0.5);
+    diag_mass = anisotropic ? (_mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0))
+                            : (4.0 + _mass);
+
+    if (csw_r == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+    }
+    if (csw_t == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+    }
+
+    ImportGauge(_Umu);
+  }
+
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
+
+  virtual void Mooee(const FermionField &in, FermionField &out);
+  virtual void MooeeDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInv(const FermionField &in, FermionField &out);
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
+
+  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  void ImportGauge(const GaugeField &_Umu);
+
+  // Derivative parts unpreconditioned pseudofermions.
+  //
+  // force is overwritten with the derivative of the Wilson hopping term
+  // (DhopDeriv) plus the clover-term derivative. The clover part is built from
+  // the outer product of X and Y: for every ordered pair mu != nu the
+  // spin-traced sigma_{mu nu} * Lambda is inserted into the staples by Cmunu,
+  // weighted by 2*csw_t when the plane contains the temporal direction and by
+  // 2*csw_r otherwise.
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  {
+    conformable(X._grid, Y._grid);
+    conformable(X._grid, force._grid);
+    GaugeLinkField force_mu(force._grid), lambda(force._grid);
+    GaugeField clover_force(force._grid);
+    PropagatorField Lambda(force._grid);
+
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = zero;
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    // count walks the off-diagonal entries of the table above in row-major order
+    int count = 0;
+    clover_force = zero;
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = zero;
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+          continue;
+
+        // Directions are indexed 0..3, so the temporal one is Tdir; comparing
+        // against 4 could never match and csw_t would never be applied.
+        RealD factor;
+        if (nu == Tdir || mu == Tdir)
+        {
+          factor = 2.0 * csw_t;
+        }
+        else
+        {
+          factor = 2.0 * csw_r;
+        }
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor * Cmunu(U, lambda, mu, nu);          // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    force += clover_force;
+  }
+
+  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis; returns (C1+ + C2+ + C3+ + C4+) - (C1- + C2- + C3- + C4-) as assembled below
+  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda._grid, U[0]._grid); // lambda and the links must share a grid
+    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
+    // insertion in upper staple (the four "+" terms are added to out)
+    // please check redundancy of shift operations
+
+    // C1+ : lambda * U[nu] replaces the first U[nu] of the staple
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+ : U[mu] * ShiftStaple(adj(lambda), mu) replaces U[mu] in the staple
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+ : U[nu] * ShiftStaple(adj(lambda), nu) replaces the last U[nu] of the staple
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+ : unmodified staple multiplied by lambda on the right
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple (the four "-" terms are subtracted from out)
+    // C1- : ShiftStaple(lambda, mu) multiplies the unmodified staple on the left
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2- : adj(lambda) * U[nu] replaces the first U[nu] of the staple
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3- : lambda * U[nu] replaces the last U[nu] of the staple
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4- : unmodified staple multiplied by lambda on the right
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+private:
+  // here fixing the 4 dimensions, make it more general?
+
+  RealD csw_r;                                               // Clover coefficient - spatial
+  RealD csw_t;                                               // Clover coefficient - temporal
+  RealD diag_mass;                                           // Mass term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+
+  // eventually these can be compressed into 6x6 blocks instead of the 12x12
+  // using the DeGrand-Rossi basis for the gamma matrices
+  // Returns a clover-typed field on F's grid that is zero except at spin
+  // entries (0,1), (1,0), (2,3), (3,2), each set to -i * F.
+  // NOTE(review): presumably the spin structure of sigma_YZ in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
+    }
+
+    return T;
+  }
+
+  // Returns a clover-typed field on F's grid that is zero except at spin
+  // entries (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F.
+  // NOTE(review): presumably the spin structure of sigma_XZ in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 1) = -F._odata[i]()();
+      T._odata[i]()(1, 0) = F._odata[i]()();
+      T._odata[i]()(2, 3) = -F._odata[i]()();
+      T._odata[i]()(3, 2) = F._odata[i]()();
+    }
+
+    return T;
+  }
+
+  // Returns a clover-typed field on F's grid that is zero except on the spin
+  // diagonal: (0,0) = -i*F, (1,1) = +i*F, (2,2) = -i*F, (3,3) = +i*F.
+  // NOTE(review): presumably the spin structure of sigma_XY in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverXY(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(1, 1) = timesI(F._odata[i]()());
+      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
+    }
+
+    return T;
+  }
+
+  // Returns a clover-typed field on F's grid that is zero except at spin
+  // entries (0,1) = +i*F, (1,0) = +i*F, (2,3) = -i*F, (3,2) = -i*F.
+  // NOTE(review): presumably the spin structure of sigma_XT in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverXT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 1) = timesI(F._odata[i]()());
+      T._odata[i]()(1, 0) = timesI(F._odata[i]()());
+      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
+    }
+
+    return T;
+  }
+
+  // Returns a clover-typed field on F's grid that is zero except at spin
+  // entries (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F.
+  // NOTE(review): presumably the spin structure of sigma_YT in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverYT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 1) = -(F._odata[i]()());
+      T._odata[i]()(1, 0) = (F._odata[i]()());
+      T._odata[i]()(2, 3) = (F._odata[i]()());
+      T._odata[i]()(3, 2) = -(F._odata[i]()());
+    }
+
+    return T;
+  }
+
+  // Returns a clover-typed field on F's grid that is zero except on the spin
+  // diagonal: (0,0) = +i*F, (1,1) = -i*F, (2,2) = -i*F, (3,3) = +i*F.
+  // NOTE(review): presumably the spin structure of sigma_ZT in Grid's gamma basis - confirm.
+  CloverFieldType fillCloverZT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    // Bound is taken from F's own grid, the grid T was allocated on, so the
+    // indices used for T and F always match the loop range.
+    const int osites = F._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int i = 0; i < osites; i++)
+    {
+      T._odata[i]()(0, 0) = timesI(F._odata[i]()());
+      T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
+      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
+    }
+
+    return T;
+  }
+};
+}
+}
+
+#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
@@ -265,7 +265,6 @@ public:
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  std::vector<int> same_node;
  std::vector<int> surface_list;
@@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
 template <class Impl>
 WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
-                                   const ImplParams &p)
+                                   const ImplParams &p,
+                                   const WilsonAnisotropyCoefficients &anis)
    : Kernels(p),
      _grid(&Fgrid),
      _cbgrid(&Hgrid),
@@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      Umu(&Fgrid),
      UmuEven(&Hgrid),
      UmuOdd(&Hgrid),
-      _tmp(&Hgrid)
+      _tmp(&Hgrid),
+      anisotropyCoeff(anis)
 {
  // Allocate the required comms buffer
  ImportGauge(_Umu);
+  if  (anisotropyCoeff.isAnisotropic){
+    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+  } else {
+    diag_mass = 4.0 + mass;
+  }
+
+
 }

 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
  GaugeField HUmu(_Umu._grid);
-  HUmu = _Umu * (-0.5);
+
+  //Here multiply the anisotropy coefficients
+  if (anisotropyCoeff.isAnisotropic)
+  {
+
+    for (int mu = 0; mu < Nd; mu++)
+    {
+      GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
+      if (mu != anisotropyCoeff.t_direction)
+        U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+
+      PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
+    }
+  }
+  else
+  {
+    HUmu = _Umu * (-0.5);
+  }
  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
  pickCheckerboard(Even, UmuEven, Umu);
  pickCheckerboard(Odd, UmuOdd, Umu);
@@ -83,14 +109,14 @@ template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
  Dhop(in, out, DaggerNo);
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
  Dhop(in, out, DaggerYes);
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
@@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 template <class Impl>
 void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
-  typename FermionField::scalar_type scal(4.0 + mass);
+  typename FermionField::scalar_type scal(diag_mass);
  out = scal * in;
 }

@@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
 template<class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
-  out = (1.0/(4.0+mass))*in;
+  out = (1.0/(diag_mass))*in;
 }
  
 template<class Impl>
@@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);
-  Atilde = A;
+  Atilde = A;//redundant

  st.HaloExchange(B, compressor);

@@ -345,6 +371,112 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  }
 };

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+/*******************************************************************************
+ * Name: ContractConservedCurrent
+ * Operation: contracts q_in_1 and q_in_2 with the conserved current in
+ *            direction mu, site by site, and stores the sum of the forward
+ *            and backward kernel terms in q_out.
+ * Notes: - All three fields must live on this operator's grid (_grid).
+ *        - q_out is zeroed before the kernels accumulate into it.
+ *        - curr_type is not used by this 4D implementation.
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                   PropagatorField &q_in_2,
+                                                   PropagatorField &q_out,
+                                                   Current curr_type,
+                                                   unsigned int mu)
+{
+    conformable(_grid, q_in_1._grid);
+    conformable(_grid, q_in_2._grid);
+    conformable(_grid, q_out._grid);
+    PropagatorField tmp1(_grid), tmp2(_grid);
+    q_out = zero;
+
+    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+    // Inefficient comms method but not performance critical.
+    tmp1 = Cshift(q_in_1, mu, 1);
+    tmp2 = Cshift(q_in_2, mu, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Forward term: shifted q1 against unshifted q2.
+        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
+                                                 q_in_2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+        // Backward term: unshifted q1 against shifted q2.
+        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
+                                                 tmp2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+    }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrent
+ * Operation: accumulates into q_out the sequential insertion of the conserved
+ *            current in direction mu acting on q_in, weighted by the phase
+ *            exp(2*pi*i * sum_nu mom[nu]*x[nu]/L[nu]) over the first Nd-1
+ *            directions and restricted to timeslices tmin..tmax.
+ * Notes: - q_in and q_out must live on this operator's grid (_grid).
+ *        - q_out is zeroed before accumulation.
+ *        - mom is read at indices 0..Nd-2, so it needs at least Nd-1 entries.
+ *        - curr_type is not used by this 4D implementation.
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                              PropagatorField &q_out,
+                                              Current curr_type,
+                                              unsigned int mu,
+                                              std::vector<Real> mom,
+                                              unsigned int tmin,
+                                              unsigned int tmax)
+{
+    conformable(_grid, q_in._grid);
+    conformable(_grid, q_out._grid);
+    Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+    ComplexD i(0.0,1.0);
+    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    // Temporal extent of the grid this operator actually lives on, so the
+    // wrap-around test below stays correct for non-default lattices.
+    unsigned int LLt    = _grid->_fdimensions[Tp];
+
+    // Momentum projection. The loop index is named nu so that it does not
+    // shadow the current direction mu.
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        LatticeCoordinate(coor, nu);
+        ph = ph + mom[nu]*coor*((1./(_grid->_fdimensions[nu])));
+    }
+    ph = exp((RealD)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_grid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu) and q(x - mu).
+    tmp = Cshift(q_in, mu, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU],
+                                                q_out._odata[sU],
+                                                Umu, sU, mu, t_mask);
+        }
+
+        // Repeat for backward direction. The window moves by tshift, which
+        // is 1 only when the current points along the time direction.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) &&
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+        // If tmax = LLt-1 (last timeslice) include timeslice 0 if the time
+        // is shifted (mu == Tp).
+        unsigned int t0 = 0;
+        if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU],
+                                                q_out._odata[sU],
+                                                Umu, sU, mu, t_mask);
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion);
 AdjointFermOpTemplateInstantiate(WilsonFermion);
 TwoIndexFermOpTemplateInstantiate(WilsonFermion);
@@ -44,6 +44,21 @@ class WilsonFermionStatic {
  static const int npoint = 8;
 };

+ struct WilsonAnisotropyCoefficients: Serializable  // anisotropy settings; defaults describe the isotropic case
+ {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
+  bool, isAnisotropic,  // when false, the gauge links are only scaled by -0.5
+  int, t_direction,     // direction whose links are NOT rescaled by nu / xi_0
+  double, xi_0,         // divisor of the rescaling applied to the other directions
+  double, nu);          // numerator of the rescaling applied to the other directions
+
+  WilsonAnisotropyCoefficients():
+    isAnisotropic(false), 
+    t_direction(Nd-1), 
+    xi_0(1.0), 
+    nu(1.0){}
+};
+
 template <class Impl>
 class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 public:
@@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);

  /////////////////////////////////////////////////////////
  // half checkerboard operations
@@ -117,8 +132,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                GridRedBlackCartesian &Hgrid, RealD _mass,
-                const ImplParams &p = ImplParams());
+                GridRedBlackCartesian &Hgrid, RealD _mass, 
+                const ImplParams &p = ImplParams(), 
+                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );

  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
@@ -130,6 +146,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  //    protected:
 public:
  RealD mass;
+  RealD diag_mass;

  GridBase *_grid;
  GridBase *_cbgrid;
@@ -146,6 +163,24 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
+
+  WilsonAnisotropyCoefficients anisotropyCoeff;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           std::vector<Real> mom,
+                           unsigned int tmin,
+                           unsigned int tmax);
 };

 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Andrew Lawson <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -702,6 +703,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe

 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+
+/* Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
+ *   qSite    - input site object; extract() splits it into Nsimd scalar objects.
+ *   qSiteRev - output site object; merge() rebuilds it from the reversed list.
+ *   Nsimd    - number of scalar objects; entries i and Nsimd - 1 - i are swapped.
+ * Expands to a braced block and needs a SitePropagator type in scope where it
+ * is used. */
+#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
+{ \
+    std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
+    extract(qSite, qSiteVec); \
+    for (int i = 0; i < Nsimd / 2; ++i) \
+    { \
+        typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
+        qSiteVec[i] = qSiteVec[Nsimd - i - 1]; \
+        qSiteVec[Nsimd - i - 1] = tmp; \
+    } \
+    merge(qSiteRev, qSiteVec); \
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrent (5D)
+ * Operation: contracts the 5D propagators q_in_1 and q_in_2 with the conserved
+ *            current in 4D direction mu and accumulates every fifth-dimension
+ *            slice into the 4D field q_out.
+ * Notes: - q_in_1 and q_in_2 must live on FermionGrid(); q_out on the 4D grid.
+ *        - q_out is zeroed before accumulation.
+ *        - Slice s of q_in_1 is paired with the slice counted from the other
+ *          end of q_in_2 (index sF2 runs downwards while sF1 runs upwards).
+ *        - For curr_type == Current::Axial the kernels are called with
+ *          switch_sign set on the first half of the slices (s < LLs / 2).
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                     PropagatorField &q_in_2,
+                                                     PropagatorField &q_out,
+                                                     Current curr_type,
+                                                     unsigned int mu)
+{
+    conformable(q_in_1._grid, FermionGrid());
+    conformable(q_in_1._grid, q_in_2._grid);
+    conformable(_FourDimGrid, q_out._grid);
+    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
+    unsigned int LLs = q_in_1._grid->_rdimensions[0];
+    q_out = zero;
+
+    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
+    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
+    tmp1 = Cshift(q_in_1, mu + 1, 1);
+    tmp2 = Cshift(q_in_2, mu + 1, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // sF1 walks the slices of q1 upwards, sF2 walks those of q2 downwards.
+        unsigned int sF1 = sU * LLs;
+        unsigned int sF2 = (sU + 1) * LLs - 1;
+
+        for (unsigned int s = 0; s < LLs; ++s)
+        {
+            bool axial_sign = ((curr_type == Current::Axial) &&
+                               (s < (LLs / 2)));
+            SitePropagator qSite2, qmuSite2;
+
+            // If vectorised in 5th dimension, reverse q2 vector to match up
+            // sites correctly.
+            if (Impl::LsVectorised)
+            {
+                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
+            }
+            else
+            {
+                qSite2   = q_in_2._odata[sF2];
+                qmuSite2 = tmp2._odata[sF2];
+            }
+            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
+                                                     qSite2, 
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
+                                                     qmuSite2,
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            sF1++;
+            sF2--;
+        }
+    }
+}
+
+
+/*******************************************************************************
+ * Name: SeqConservedCurrent (5D)
+ * Operation: accumulates into q_out the sequential insertion of the conserved
+ *            current in 4D direction mu acting on q_in, weighted by the phase
+ *            exp(2*pi*i * sum_nu mom[nu]*x[nu]/L[nu]) over the first Nd-1
+ *            4D directions and restricted to timeslices tmin..tmax.
+ * Notes: - q_in and q_out must both live on FermionGrid().
+ *        - q_out is zeroed before accumulation.
+ *        - mom is read at indices 0..Nd-2, so it needs at least Nd-1 entries.
+ *        - For curr_type == Current::Axial the kernels are called with
+ *          switch_sign set on the first half of the slices (s < LLs / 2).
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                                PropagatorField &q_out,
+                                                Current curr_type, 
+                                                unsigned int mu,
+                                                std::vector<Real> mom,
+                                                unsigned int tmin, 
+                                                unsigned int tmax)
+{
+    conformable(q_in._grid, FermionGrid());
+    conformable(q_in._grid, q_out._grid);
+    Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
+    PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
+                    tmp(FermionGrid());
+    ComplexD i(0.0, 1.0);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    unsigned int LLs = q_in._grid->_rdimensions[0];
+    // Temporal extent of the 4D grid this operator actually lives on, so the
+    // wrap-around test below stays correct for non-default lattices.
+    unsigned int LLt    = _FourDimGrid->_fdimensions[Tp];
+
+    // Momentum projection.
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        // Shift coordinate lattice index by 1 to account for 5th dimension.
+        LatticeCoordinate(coor, nu + 1);
+        ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
+    }
+    ph = exp((RealD)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_FourDimGrid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
+    // by one.
+    tmp = Cshift(q_in, mu + 1, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu + 1, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs;
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+
+        // Repeat for backward direction. The window moves by tshift, which
+        // is 1 only when the current points along the time direction.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+        // If tmax = LLt-1 (last timeslice) include timeslice 0 if the time
+        // is shifted (mu == Tp).
+        unsigned int t0 = 0;
+        if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs;
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
  
@@ -214,6 +214,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type, 
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu,
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
  vstream(out._odata[sF], result);
 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U.
+#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
+#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ * Parameters:
+ *   q_in_1      - site of the first propagator (see note on shifting above).
+ *   q_in_2      - site of the second propagator; enters as g5 * adj(q_in_2) * g5.
+ *   q_out       - accumulator: the term is added, or subtracted when
+ *                 switch_sign is true.
+ *   U, sU, mu   - doubled gauge field, its site index and the link direction
+ *                 handed to Impl::multLinkProp.
+ *   switch_sign - flips the sign with which the term is accumulated.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    SitePropagator result, tmp;
+    Gamma g5(Gamma::Algebra::Gamma5);
+    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
+    result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
+    if (switch_sign)
+    {
+        q_out -= result;
+    }
+    else
+    {
+        q_out += result;
+    }
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ * Parameters:
+ *   q_in_1      - site of the first propagator.
+ *   q_in_2      - site of the second propagator (see note on shifting above);
+ *                 enters as g5 * adj(q_in_2) * g5.
+ *   q_out       - accumulator: the term is subtracted, or added when
+ *                 switch_sign is true (opposite convention to the Fwd kernel).
+ *   U, sU, mu   - doubled gauge field, its site index and the direction; the
+ *                 link index handed to Impl::multLinkProp is mu + Nd.
+ *   switch_sign - flips the sign with which the term is accumulated.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    SitePropagator result, tmp;
+    Gamma g5(Gamma::Algebra::Gamma5);
+    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
+    result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
+    if (switch_sign)
+    {
+        q_out += result;
+    }
+    else
+    {
+        q_out -= result;
+    }
+}
+
+// G-parity requires more specialised implementation.
+// NO_CURR_SITE(Impl) emits explicit specialisations of both contraction
+// kernels for the given implementation whose bodies are just assert(0), so
+// calling them aborts in builds where assert is active. With NDEBUG defined
+// the assert compiles away and the call does nothing.
+// Parameter names and order match the generic definitions (sU, then mu).
+#define NO_CURR_SITE(Impl) \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+} \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+}
+
+NO_CURR_SITE(GparityWilsonImplF);
+NO_CURR_SITE(GparityWilsonImplD);
+NO_CURR_SITE(GparityWilsonImplFH);
+NO_CURR_SITE(GparityWilsonImplDF);
+
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ * Parameters:
+ *   q_in        - propagator site (see note on shifting above).
+ *   q_out       - accumulator: the term is added, or subtracted when
+ *                 switch_sign is true.
+ *   U, sU, mu   - doubled gauge field, its site index and the link direction
+ *                 handed to Impl::multLinkProp.
+ *   t_mask      - selects which entries of the result are kept; the rest are
+ *                 replaced by 0.*result before accumulation.
+ *   switch_sign - flips the sign with which the term is accumulated.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    SitePropagator result;
+    Impl::multLinkProp(result, U._odata[sU], q_in, mu);
+    result = WilsonCurrentFwd(result, mu);
+
+    // Zero any unwanted timeslice entries.
+    result = predicatedWhere(t_mask, result, 0.*result);
+
+    if (switch_sign)
+    {
+        q_out -= result;
+    }
+    else
+    {
+        q_out += result;
+    }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ * Parameters:
+ *   q_in        - propagator site (see note on shifting above).
+ *   q_out       - accumulator: the term is subtracted, or added when
+ *                 switch_sign is true (opposite convention to the Fwd kernel).
+ *   U, sU, mu   - doubled gauge field, its site index and the direction; the
+ *                 link index handed to Impl::multLinkProp is mu + Nd.
+ *   t_mask      - selects which entries of the result are kept; the rest are
+ *                 replaced by 0.*result before accumulation.
+ *   switch_sign - flips the sign with which the term is accumulated.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    SitePropagator result;
+    Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
+    result = WilsonCurrentBwd(result, mu);
+
+    // Zero any unwanted timeslice entries.
+    result = predicatedWhere(t_mask, result, 0.*result);
+
+    if (switch_sign)
+    {
+        q_out += result;
+    }
+    else
+    {
+        q_out -= result;
+    }
+}
+
 FermOpTemplateInstantiate(WilsonKernels);
 AdjointFermOpTemplateInstantiate(WilsonKernels);
 TwoIndexFermOpTemplateInstantiate(WilsonKernels);
@@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
 public:
   
  template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
  {
@@ -99,7 +99,7 @@ public:
  }
     
  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
    // no kernel choice  
@@ -116,7 +116,7 @@ public:
  }
     
  template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
 {
@@ -161,7 +161,7 @@ public:
  }

  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {

@@ -180,6 +180,38 @@ public:
  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
      
+  //////////////////////////////////////////////////////////////////////////////
+  // Utilities for inserting Wilson conserved current.
+  //////////////////////////////////////////////////////////////////////////////
+  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+
 private:
     // Specialised variants
  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -946,5 +946,6 @@ INSTANTIATE_THEM(DomainWallVec5dImplFH);
 INSTANTIATE_THEM(DomainWallVec5dImplDF);
 INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
 INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
 }}
@@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {

    RealD factor = 0.5 * beta / RealD(Nc);

-    //GaugeLinkField Umu(U._grid);
+    GaugeLinkField Umu(U._grid);
    GaugeLinkField dSdU_mu(U._grid);
    for (int mu = 0; mu < Nd; mu++) {
-      //Umu = PeekIndex<LorentzIndex>(U, mu);
+      Umu = PeekIndex<LorentzIndex>(U, mu);

      // Staple in direction mu
-      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
-
-  
-      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
-      dSdU_mu = Ta(dSdU_mu) * factor;
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      dSdU_mu = Ta(Umu * dSdU_mu) * factor;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
@@ -16,12 +16,12 @@ class ScalarImplTypes {
    typedef iImplField<Simd> SiteField;
    typedef SiteField        SitePropagator;
    typedef SiteField        SiteComplex;
-    
+
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
    typedef Field              FermionField;
    typedef Field              PropagatorField;
-    
+
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
    }
@@ -47,54 +47,60 @@ class ScalarImplTypes {
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
      U = 1.0;
    }
-    
+
    static void MomentumSpacePropagator(Field &out, RealD m)
    {
      GridBase           *grid = out._grid;
      Field              kmu(grid), one(grid);
      const unsigned int nd    = grid->_ndimension;
      std::vector<int>   &l    = grid->_fdimensions;
-      
+
      one = Complex(1.0,0.0);
      out = m*m;
      for(int mu = 0; mu < nd; mu++)
      {
        Real twoPiL = M_PI*2./l[mu];
-        
+
        LatticeCoordinate(kmu,mu);
        kmu = 2.*sin(.5*twoPiL*kmu);
        out = out + kmu*kmu;
      }
      out = one/out;
    }
-    
+
    static void FreePropagator(const Field &in, Field &out,
                               const Field &momKernel)
    {
      FFT   fft((GridCartesian *)in._grid);
      Field inFT(in._grid);
-      
+
      fft.FFT_all_dim(inFT, in, FFT::forward);
      inFT = inFT*momKernel;
      fft.FFT_all_dim(out, inFT, FFT::backward);
    }
-    
+
    static void FreePropagator(const Field &in, Field &out, RealD m)
    {
      Field momKernel(in._grid);
-      
+
      MomentumSpacePropagator(momKernel, m);
      FreePropagator(in, out, momKernel);
    }
-    
+
  };

+  #ifdef  USE_FFT_ACCELERATION
+  #ifndef FFT_MASS
+  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+  #endif
+  #endif
+  
  template <class S, unsigned int N>
  class ScalarAdjMatrixImplTypes {
  public:
    typedef S Simd;
    typedef QCD::SU<N> Group;
-    
+
    template <typename vtype>
    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
    template <typename vtype>
@@ -103,24 +109,119 @@ class ScalarImplTypes {
    typedef iImplField<Simd>   SiteField;
    typedef SiteField          SitePropagator;
    typedef iImplComplex<Simd> SiteComplex;
-    
+
    typedef Lattice<SiteField>   Field;
    typedef Lattice<SiteComplex> ComplexField;
    typedef Field                FermionField;
    typedef Field                PropagatorField;

-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+    // Accumulates the lattice momentum squared into `out`:
+    //   out += sum_mu [ 2 sin( (2 pi / L_mu) * x_mu / 2 ) ]^2
+    // with L_mu taken from FullDimensions(). `out` is added to, not
+    // overwritten, so the caller must initialise it (to zero, m^2, ...).
+    // (assumes LatticeCoordinate fills its field with the mu-th site coordinate)
+    static void MomentaSquare(ComplexField &out)
+    {
+      GridBase *lattice = out._grid;
+      const std::vector<int> &extent = lattice->FullDimensions();
+      const int ndim = lattice->Nd();
+      ComplexField khat(lattice);
+
+      for (int dir = 0; dir < ndim; ++dir)
+      {
+        // momentum quantum 2*pi/L along this direction
+        Real twoPiOverL = M_PI * 2.0 / extent[dir];
+
+        // khat <- coordinate, then khat <- 2 sin(k/2)
+        LatticeCoordinate(khat, dir);
+        khat = 2.0 * sin(0.5 * twoPiOverL * khat);
+
+        out += khat * khat;
+      }
+    }
+
+    // Fills `out` with 1 / (m^2 + khat^2), where khat^2 is the quantity
+    // accumulated by MomentaSquare.
+    static void MomentumSpacePropagator(ComplexField &out, RealD m)
+    {
+      // denominator: start from m^2, then add the momentum-squared term
+      out = m * m;
+      MomentaSquare(out);
+
+      // site-by-site inversion
+      ComplexField unity(out._grid);
+      unity = Complex(1.0, 0.0);
+      out = unity / out;
+    }
+
+    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) // fills P with momenta drawn from pRNG
+    {
+#ifndef USE_FFT_ACCELERATION
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+      // Fourier-accelerated branch: P = FFT^-1[ sqrt(khat^2 + M^2) * FFT[gaussian] ], with M = FFT_MASS
+      Field Pgaussian(P._grid), Pp(P._grid);
+      ComplexField p2(P._grid); p2 = zero; // MomentaSquare accumulates, so start from zero
+      RealD M = FFT_MASS;
+      // draw the unweighted Gaussian field first
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
+      // move it to momentum space
+      FFT theFFT((GridCartesian*)P._grid);
+      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
+      MomentaSquare(p2); // p2 = khat^2
+      p2 += M * M; // p2 = khat^2 + M^2
+      p2 = sqrt(p2);
+      Pp *= p2; // weight every momentum mode
+      theFFT.FFT_all_dim(P, Pp, FFT::backward);
+      // NOTE(review): any FFT normalisation factor is whatever FFT_all_dim applies -- not visible here
+#endif //USE_FFT_ACCELERATION
    }

    static inline Field projectForce(Field& P) {return P;}

-    static inline void update_field(Field& P, Field& U, double ep) {
-      U += P*ep;
+    static inline void update_field(Field &P, Field &U, double ep) // advances U by step ep along the momenta P
+    {
+#ifndef USE_FFT_ACCELERATION
+      double t0=usecond(); 
+      U += P * ep;
+      double t1=usecond();
+      double total_time = (t1-t0)/1e6; // seconds, assuming usecond() returns microseconds
+      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
+#else
+      // Fourier-accelerated update:
+      //   1. FFT the momenta, P(x) -> P(p)
+      //   2. multiply by the propagator, P'(p) = P(p) / (M^2 + khat^2), with M = FFT_MASS
+      //   3. transform back, P'(p) -> P'(x)
+      //   4. U += P'(x) * ep
+      // NOTE(review): p2 and first_call are function-local statics, so the propagator is built once (on the first grid seen) and reused by every later call -- confirm only one grid is ever used
+      Field Pp(U._grid), P_FFT(U._grid);     
+      static ComplexField p2(U._grid);
+      RealD M = FFT_MASS;
+      
+      FFT theFFT((GridCartesian*)U._grid);
+      theFFT.FFT_all_dim(Pp, P, FFT::forward);
+
+      static bool first_call = true;
+      if (first_call)
+      {
+        // fill the cached propagator on the first call only
+        MomentumSpacePropagator(p2, M);
+        first_call = false;
+      }
+      Pp *= p2;
+      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
+      U += P_FFT * ep;
+
+#endif //USE_FFT_ACCELERATION
    }

-    static inline RealD FieldSquareNorm(Field& U) {
-      return (TensorRemove(sum(trace(U*U))).real());
+    static inline RealD FieldSquareNorm(Field &U) // returns the (optionally Fourier-weighted) squared norm of U
+    {
+#ifndef USE_FFT_ACCELERATION
+      return (TensorRemove(sum(trace(U * U))).real());
+#else
+      // With Fourier acceleration the norm is weighted by the propagator:
+      //   -(1/V) * sum_p Re tr[ U(p)^dagger U(p) ] / (M^2 + khat^2)   (Parseval's theorem)
+      // This costs one forward FFT, U(x) -> U(p).
+      // M is taken from the FFT_MASS macro.
+      // V is the value returned by gSites().
+      FFT theFFT((GridCartesian*)U._grid);
+      Field Up(U._grid);
+
+      theFFT.FFT_all_dim(Up, U, FFT::forward);
+      RealD M = FFT_MASS;
+      ComplexField p2(U._grid);
+      MomentumSpacePropagator(p2, M); // p2 = 1 / (M^2 + khat^2)
+      Field Up2 = Up * p2;
+      // from the definition of the DFT we need to divide by the volume
+      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
+#endif //USE_FFT_ACCELERATION
    }

    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -146,7 +247,7 @@ class ScalarImplTypes {
  typedef ScalarImplTypes<vComplex> ScalarImplCR;
  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
-    
+
  // Hardcoding here the size of the matrices
  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
@@ -155,7 +256,7 @@ class ScalarImplTypes {
  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
-  
+
  //}
 }

@@ -30,119 +30,179 @@ directory
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H

-
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure

-namespace Grid {
-  // FIXME drop the QCD namespace everywhere here
+namespace Grid
+{
+// FIXME drop the QCD namespace everywhere here

-  template <class Impl, int Ndim >
-  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-  public:
-    INHERIT_FIELD_TYPES(Impl);
-  private:
-    RealD mass_square;
-    RealD lambda;
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
+{
+public:
+  INHERIT_FIELD_TYPES(Impl);

+private:
+  RealD mass_square;
+  RealD lambda;
+  RealD g;
+  const unsigned int N = Impl::Group::Dimension;

-    typedef typename Field::vector_object vobj;
-    typedef CartesianStencil<vobj,vobj> Stencil;
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;

-    SimpleCompressor<vobj> compressor;
-    int npoint = 2*Ndim;
-    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
-    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
+  SimpleCompressor<vobj> compressor;
+  int npoint = 2 * Ndim;
+  std::vector<int> directions;    //
+  std::vector<int> displacements; //

-
-  public:
-
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
-      for (int mu = 0 ; mu < Ndim; mu++){
-		directions[mu]         = mu; directions[mu+Ndim]    = mu;
-		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
-      }
+public:
+  // Stores the couplings (ms -> mass_square, l -> lambda, gval -> g) and builds
+  // the 2*Ndim stencil points: entries [0, Ndim) are the +1 shifts along each
+  // direction, entries [Ndim, 2*Ndim) the matching -1 shifts.
+  // The initialiser list follows the member declaration order (directions
+  // before displacements), which is the order the compiler uses anyway.
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval)
+      : mass_square(ms), lambda(l), g(gval),
+        directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = mu;
+      directions[mu + Ndim] = mu;
+      displacements[mu] = 1;
+      displacements[mu + Ndim] = -1;
    }
+  }

-    virtual std::string LogParameters() {
-      std::stringstream sstream;
-      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
-      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-      return sstream.str();
-    }
+  // Returns the couplings of this action as text, one GridLogMessage-prefixed
+  // line per parameter.
+  virtual std::string LogParameters()
+  {
+    std::stringstream report;
+    report << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl
+           << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl
+           << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
+    return report.str();
+  }

-    virtual std::string action_name() {return "ScalarAction";}
+  virtual std::string action_name() { return "ScalarAction"; }

-    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}

-    virtual RealD S(const Field &p) {
-      assert(p._grid->Nd() == Ndim);
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-      phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
-      for (int mu = 0; mu < Ndim; mu++) {
-	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  int permute_type;
-	  StencilEntry *SE;
-	  vobj temp2;
-	  const vobj *temp, *t_p;
-	    
-	  SE = phiStencil.GetEntry(permute_type, mu, i);
-	  t_p  = &p._odata[i];
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
-	    } else {
-	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
-	    }
-	  } else {
-	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
-	//  action -= pshift*p + p*pshift;
-      }
-      // NB the trace in the algebra is normalised to 1/2
-      // minus sign coming from the antihermitian fields
-      return -(TensorRemove(sum(trace(action)))).real();
-    };
-
-    virtual void deriv(const Field &p, Field &force) {
-      assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-      // move this outside
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      
-      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      for (int point = 0; point < npoint; point++) {
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  const vobj *temp;
-	  vobj temp2;
-	  int permute_type;
-	  StencilEntry *SE;
-	  SE = phiStencil.GetEntry(permute_type, point, i);
-	  
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      force._odata[i] -= temp2;
-	    } else {
-	      force._odata[i] -= *temp;
-	    }
-	  } else {
-	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
+  // Evaluates the action on the field p. Per site the density is
+  //   (2*Ndim + mass_square) * p^2 - lambda * p^4
+  //     - sum_{mu < Ndim} [ p(x+mu) * p(x) + p(x) * p(x+mu) ]
+  // and the return value is  -Re sum_x tr(density) * N / g.
+  // Only the forward stencil points [0, Ndim) are visited, so each link is
+  // counted once.
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    // NOTE(review): function-local static -- the stencil is built from the grid
+    // of the first call and reused afterwards; confirm a single grid is used.
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    // only the two work fields that are actually used are allocated
+    Field action(p._grid), phisquared(p._grid);
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // Stencil form of  action -= Cshift(p, mu, +1) * p + p * Cshift(p, mu, +1),
+      // which would be less efficient written with Cshift.
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE;
+        vobj temp2;
+        const vobj *temp, *t_p;
+
+        SE = phiStencil.GetEntry(permute_type, mu, i);
+        t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          // neighbour lives in this rank's own data
+          temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            // neighbour needs a SIMD-lane permutation before use
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          // neighbour was received into the communication buffer by HaloExchange
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
      }
    }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
  };
-  
-}  // namespace Grid

-#endif  // SCALAR_INT_ACTION_H
+  // Force term: writes into `force`
+  //   [ (2*Ndim + mass_square) * p - 2*lambda * p^3
+  //     - (sum of the 2*Ndim stencil neighbours of p) ] * N / g
+  // and logs wall-clock timings and flop rates through GridLogIntegrator.
+  virtual void deriv(const Field &p, Field &force)
+  {
+    double t0 = usecond();
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
+    double interm_t = usecond();
+
+    // move this outside
+    // NOTE(review): function-local static -- the stencil is built from the grid
+    // of the first call and reused afterwards; confirm a single grid is used.
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond();
+    int chunk = 128;
+    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of  force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+#pragma omp parallel
+      {
+        // declared inside the parallel region, hence private to each thread
+        int permute_type;
+        StencilEntry *SE;
+        const vobj *temp;
+
+#pragma omp for schedule(static, chunk)
+        for (int i = 0; i < p._grid->oSites(); i++)
+        {
+          SE = phiStencil.GetEntry(permute_type, point, i);
+          // prefetch next p?
+
+          if (SE->_is_local)
+          {
+            temp = &p._odata[SE->_offset];
+
+            if (SE->_permute)
+            {
+              vobj temp2;
+              permute(temp2, *temp, permute_type);
+              force._odata[i] -= temp2;
+            }
+            else
+            {
+              force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+            }
+          }
+          else
+          {
+            // neighbour was received into the communication buffer by HaloExchange
+            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+          }
+        }
+      }
+    }
+    force *= N / g;
+
+    double t1 = usecond();
+    double total_time = (t1 - t0) / 1e6;
+    double interm_time = (interm_t - t0) / 1e6;
+    double halo_time = (halo_t - interm_t) / 1e6;
+    double stencil_time = (t1 - halo_t) / 1e6;
+    std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+    std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+    std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+
+    // Flop estimates are evaluated in double precision: gSites() times the
+    // per-site count wraps around in 32-bit integer arithmetic on large lattices.
+    const double volume = p._grid->gSites();
+    const double n = N;
+    double flops = volume * (14 * n * n * n + 18 * n * n + 2);
+    double flops_no_stencil = volume * (14 * n * n * n + 6 * n * n + 2);
+    double Gflops = flops / (total_time * 1e9);
+    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+    std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+  }
+};
+
+} // namespace Grid
+
+#endif // SCALAR_INT_ACTION_H
@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
    ScalarAdjGenericHMCRunner;

 template <int Colours> 
-using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;

 }  // namespace QCD
 }  // namespace Grid
@@ -92,6 +92,19 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  PlaquetteMod(): ObsBase(NoParameters()){}
 };

+template < class Impl >
+class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{ // observable module wrapping PolyakovLogger<Impl>; takes no parameters
+  typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
+  using ObsBase::ObsBase; // inherit the base-class constructors
+  // (the members above and initialize() below are private: no access specifier precedes them)
+  // creates the PolyakovLogger and stores it in ObservablePtr
+  virtual void initialize(){
+    this->ObservablePtr.reset(new PolyakovLogger<Impl>());
+  }
+  public:
+  PolyakovMod(): ObsBase(NoParameters()){} // default construction forwards an empty parameter set to the base
+};
+

 template < class Impl >
 class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
@@ -45,5 +45,7 @@ class HmcObservable {

 #include "plaquette.h"
 #include "topological_charge.h"
+#include "polyakov_loop.h"
+

 #endif  //  HMC_OBSERVABLE_H
@@ -0,0 +1,68 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/observables/polyakov_loop.h
+
+Copyright (C) 2017
+
+Author: David Preti <david.preti@csic.es>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef HMC_POLYAKOV_H
+#define HMC_POLYAKOV_H
+
+namespace Grid {
+namespace QCD {
+
+// this is only defined for a gauge theory
+// HMC observable that prints the value of WilsonLoops<Impl>::avgPolyakovLoop
+// at the end of every trajectory.
+template <class Impl>
+class PolyakovLogger : public HmcObservable<typename Impl::Field> {
+ public:
+  // Pulls in the gauge typedefs; an Impl that lacks them fails to compile here.
+  INHERIT_GIMPL_TYPES(Impl);
+
+  // Field type expected by the HmcObservable interface.
+  typedef typename Impl::Field Field;
+
+  // Logs "Polyakov Loop: [ traj ] value" through GridLogMessage.
+  // sRNG and pRNG are not used. The precision of std::cout is raised for the
+  // printout and restored before returning.
+  void TrajectoryComplete(int traj,
+                          Field &U,
+                          GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    const int saved_precision = std::cout.precision();
+    const int print_digits = std::numeric_limits<Real>::digits10 + 1;
+
+    const ComplexD loop_average = WilsonLoops<Impl>::avgPolyakovLoop(U);
+
+    std::cout << GridLogMessage << std::setprecision(print_digits)
+              << "Polyakov Loop: [ " << traj << " ] " << loop_average
+              << std::endl;
+
+    std::cout.precision(saved_precision);
+  }
+};
+
+}  // namespace QCD
+}  // namespace Grid
+
+#endif  // HMC_POLYAKOV_H
@@ -23,6 +23,7 @@ class AdjointRep {
  typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
  typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
  static const int Dimension = ncolour * ncolour - 1;
+  static const bool isFundamental = false;

  LatticeField U;

@@ -19,6 +19,7 @@ template <int ncolour>
 class FundamentalRep {
 public:
  static const int Dimension = ncolour;
+  static const bool isFundamental = true;

  // typdef to be used by the Representations class in HMC to get the
  // types for the higher representation fields
@@ -29,6 +29,7 @@ class TwoIndexRep {
  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
  static const int Dimension = ncolour * (ncolour + S) / 2;
+  static const bool isFundamental = false;

  LatticeField U;

@@ -123,6 +123,28 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }

+
+  //////////////////////////////////////////////////
+  // average over all x,y,z the temporal loop
+  //////////////////////////////////////////////////
+  // Returns sum_x tr[ P(x) ] / (Nc * X*Y*Z*T), where P(x) is the product of the
+  // T links in direction 3 starting at x. Direction 3 is hard-wired as the
+  // temporal one, so a 4-dimensional grid is assumed.
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
+    const int Tdir = 3; // temporal direction
+    GaugeMat Ut(Umu._grid), P(Umu._grid);
+    ComplexD out;
+    int T = Umu._grid->GlobalDimensions()[Tdir];
+    int X = Umu._grid->GlobalDimensions()[0];
+    int Y = Umu._grid->GlobalDimensions()[1];
+    int Z = Umu._grid->GlobalDimensions()[2];
+
+    Ut = peekLorentz(Umu,Tdir); //Select temporal direction
+    P = Ut;
+    // multiply in the remaining T-1 links along the temporal direction
+    for (int t=1;t<T;t++){
+      P = Gimpl::CovShiftForward(Ut,Tdir,P);
+    }
+    // The volume is accumulated in floating point: Nc*X*Y*Z*T evaluated in
+    // int overflows on large lattices.
+    RealD volume = RealD(X) * RealD(Y) * RealD(Z) * RealD(T);
+    RealD norm = 1.0/(Nc*volume);
+    out = sum(trace(P))*norm;
+    return out;
+  }
+
  //////////////////////////////////////////////////
  // average over traced single links
  //////////////////////////////////////////////////
@@ -190,6 +212,7 @@ public:


 // For the force term
+/*
 static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    GridBase *grid = Umu._grid;
    std::vector<GaugeMat> U(Nd, grid);
@@ -203,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
-        // this is ~10% faster than the Staple
+        // this is ~10% faster than the Staple  -- PAB: so what it gives the WRONG answers for other BC's!
        tmp1 = Cshift(U[nu], mu, 1);
        tmp2 = Cshift(U[mu], nu, 1);
        staple += tmp1* adj(U[nu]*tmp2);
@@ -213,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    }
    staple = U[mu]*staple;
 }
-
+*/
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
@@ -291,9 +314,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    }
  }

-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, lower part
-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
  static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
@@ -315,7 +338,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //
      staple = Gimpl::ShiftStaple(
          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+          mu);
+
    }
  }

@@ -325,7 +350,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
      // Fmn +--<--+  Ut +--<--+
      //     |     |     |     |
-      //  (x)+-->--+     +-->--+(x)
+      //  (x)+-->--+     +-->--+(x)  - h.c.
      //     |     |     |     |
      //     +--<--+     +--<--+

@@ -335,7 +360,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      GaugeMat v = Vup - Vdn;
      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
      GaugeMat vu = v*u;
-      FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Cshift(vu, mu, -1));
+      FS = 0.125*(FS - adj(FS));
  }

  static Real TopologicalCharge(GaugeLorentz &U){
@@ -360,6 +387,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    return TensorRemove(Tq).real();
  }

+
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
@@ -31,44 +31,78 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
 #define GRID_SERIALISATION_ABSTRACT_READER_H

 #include <type_traits>
+#include <Grid/tensors/Tensors.h>

 namespace Grid {
-  // Vector IO utilities ///////////////////////////////////////////////////////
-  // helper function to read space-separated values
+  // Grid scalar tensors to nested std::vectors //////////////////////////////////
  template <typename T>
-  std::vector<T> strToVec(const std::string s)
+  struct TensorToVec
  {
-    std::istringstream sstr(s);
-    T                  buf;
-    std::vector<T>     v;
-    
-    while(!sstr.eof())
+    typedef T type;
+  };
+
+  template <typename T>
+  struct TensorToVec<iScalar<T>> // iScalar adds no nesting level: same mapped type as its content
+  {
+    typedef typename TensorToVec<T>::type type;
+  };
+  // iVector<T, N> maps to one std::vector level (the length N is not part of the mapped type)
+  template <typename T, int N>
+  struct TensorToVec<iVector<T, N>>
+  {
+    typedef typename std::vector<typename TensorToVec<T>::type> type;
+  };
+  // iMatrix<T, N> maps to two nested std::vector levels
+  template <typename T, int N>
+  struct TensorToVec<iMatrix<T, N>>
+  {
+    typedef typename std::vector<std::vector<typename TensorToVec<T>::type>> type;
+  };
+
+  template <typename T>
+  typename TensorToVec<T>::type tensorToVec(const T &t) // generic overload: the value is returned as-is
+  {
+    return t;
+  }
+  // iScalar overload: unwraps one level and recurses on the wrapped value
+  template <typename T>
+  typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
+  {
+    return tensorToVec(t._internal);
+  }
+
+  template <typename T, int N>
+  typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
+  {
+    typename TensorToVec<iVector<T, N>>::type v;
+
+    v.resize(N);
+    for (unsigned int i = 0; i < N; i++) 
    {
-      sstr >> buf;
-      v.push_back(buf);
+      v[i] = tensorToVec(t._internal[i]);
    }
-    
+
    return v;
  }
-  
-  // output to streams for vectors
-  template < class T >
-  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+
+  template <typename T, int N>
+  typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
  {
-    os << "[";
-    for (auto &x: v)
+    typename TensorToVec<iMatrix<T, N>>::type v;
+
+    v.resize(N);
+    for (unsigned int i = 0; i < N; i++)
    {
-      os << x << " ";
+      v[i].resize(N);
+      for (unsigned int j = 0; j < N; j++) 
+      {
+        v[i][j] = tensorToVec(t._internal[i][j]);
+      }
    }
-    if (v.size() > 0)
-    {
-      os << "\b";
-    }
-    os << "]";
-    
-    return os;
+
+    return v;
  }
-  
+
  // Vector element trait //////////////////////////////////////////////////////  
  template <typename T>
  struct element
@@ -151,15 +185,15 @@ namespace Grid {
    do
    {
      is.get(c);
-    } while (c != '<' && !is.eof());
-    if (c == '<')
+    } while (c != '(' && !is.eof());
+    if (c == '(')
    {
      int start = is.tellg();
      do
      {
        is.get(c);
-      } while (c != '>' && !is.eof());
-      if (c == '>')
+      } while (c != ')' && !is.eof());
+      if (c == ')')
      {
        int end = is.tellg();
        int psize = end - start - 1;
@@ -182,7 +216,43 @@ namespace Grid {
  template <class T1, class T2>
  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
  {
-    os << "<" << p.first << " " << p.second << ">";
+    os << "(" << p.first << " " << p.second << ")";
+    return os;
+  }
+
+  // Vector IO utilities ///////////////////////////////////////////////////////
+  // helper function to read space-separated values
+  // Parses whitespace-separated values of type T from `s`.
+  // Returns the values read, in order. Reading stops at the end of the string
+  // or at the first token that cannot be parsed as T; an empty or all-blank
+  // string gives an empty vector.
+  template <typename T>
+  std::vector<T> strToVec(const std::string s)
+  {
+    std::istringstream sstr(s);
+    T                  buf;
+    std::vector<T>     v;
+
+    // Test the extraction itself rather than eof(): an eof() loop pushes a
+    // spurious element after trailing whitespace (or for empty input) and
+    // never terminates once a malformed token sets failbit.
+    while (sstr >> buf)
+    {
+      v.push_back(buf);
+    }
+
+    return v;
+  }
+  
+  // Streams a std::vector as "[a b c]": elements are written with operator<<, separated by single spaces
+  template < class T >
+  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+  {
+    os << "[";
+    for (auto &x: v)
+    {
+      os << x << " "; // every element, including the last, is followed by a space
+    }
+    if (v.size() > 0)
+    {
+      os << "\b"; // writes a backspace character after the trailing space; NOTE(review): the space and the backspace both remain in the byte stream -- confirm readers of this output tolerate that
+    }
+    os << "]";
+    // returns the stream so that insertions can be chained
    return os;
  }

@@ -79,7 +79,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"

-#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
+#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\
                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"

 #define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
@@ -105,7 +105,6 @@ template<class vobj,class cobj>
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 public:

-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  typedef typename cobj::vector_type vector_type;
  typedef typename cobj::scalar_type scalar_type;
  typedef typename cobj::scalar_object scalar_object;
@@ -204,7 +204,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 // Reinit guard
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;
-
+static MemoryStats dbgMemStats;

 void Grid_init(int *argc,char ***argv)
 {
@@ -220,11 +220,11 @@ void Grid_init(int *argc,char ***argv)
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
    uint64_t MB64 = MB;
-    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
+    GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
-    CartesianCommunicator::Hugepages = 1;
+    GlobalSharedMemory::Hugepages = 1;
  }


@@ -251,6 +251,11 @@ void Grid_init(int *argc,char ***argv)
    assert(fp!=(FILE *)NULL);
  }

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
+    MemoryProfiler::debug = true;
+    MemoryProfiler::stats = &dbgMemStats;
+  }
+
  ////////////////////////////////////
  // Banner
  ////////////////////////////////////
@@ -324,6 +329,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;
+    std::cout<<GridLogMessage<<"  --debug-mem     : print Grid allocator activity"<<std::endl;
    std::cout<<GridLogMessage<<"  --notimestamp   : suppress millisecond resolution stamps"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
@@ -392,8 +398,8 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

-  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
-  if ( CartesianCommunicator::Hugepages) {
+  std::cout << GridLogMessage << "Requesting "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  if ( GlobalSharedMemory::Hugepages) {
    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
  }

@@ -0,0 +1,72 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/util/Profiling.h
+
+    Copyright (C) 2018
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#ifndef GRID_PERF_PROFILING_H
+#define GRID_PERF_PROFILING_H
+
+#include <sstream>
+#include <iostream>
+#include <functional>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+/// Helpers for profiling a region of code by attaching `/usr/bin/perf` to the
+/// calling process for the duration of a callback.
+struct System
+{
+    /// Runs `body` while a forked `perf record -o <file> -p <this pid>` is attached.
+    ///
+    /// @param name  Output file handed to perf's `-o`; ".data" is appended when
+    ///              `name` does not already contain ".data".
+    /// @param body  Callable executed in the calling process while perf records.
+    ///
+    /// If fork() fails, `body` still runs, just without a profiler attached.
+    /// The profiler is interrupted (SIGINT) and reaped when `body` returns or throws.
+    static void profile(const std::string& name,std::function<void()> body) {
+        std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
+
+        // Render our pid before forking: the child passes it to `perf -p`.
+        std::stringstream s;
+        s << getpid();
+        const std::string target = s.str();
+
+        // Launch profiler
+        const pid_t pid = fork();
+        if (pid == 0) {
+            // Child: silence perf's stdout/stderr, then become perf.
+            const int fd = open("/dev/null",O_RDWR);
+            if (fd >= 0) {
+                dup2(fd,1);
+                dup2(fd,2);
+                if (fd > 2) close(fd); // do not leak the extra descriptor into perf
+            }
+            execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",target.c_str(),static_cast<char*>(nullptr));
+            // execl only returns on failure. Use _exit (not exit) so the child does not
+            // run the parent's atexit handlers or flush its inherited stdio buffers.
+            _exit(127);
+        }
+
+        // Stops and reaps the profiler on scope exit, including when body() throws.
+        // Guarded on pid > 0: after a failed fork pid is -1, and kill(-1, ...) would
+        // signal every process this user is permitted to signal.
+        struct ProfilerGuard {
+            pid_t child;
+            ~ProfilerGuard() {
+                if (child > 0) {
+                    kill(child,SIGINT);
+                    waitpid(child,nullptr,0);
+                }
+            }
+        } guard{pid};
+
+        // Run body
+        body();
+    }
+
+    /// Convenience overload: profiles `body` into the default file "perf.data".
+    static void profile(std::function<void()> body) {
+        profile("perf.data",body);
+    }
+};
+
+#endif // GRID_PERF_PROFILING_H